Diffstat (limited to 'contrib/llvm/lib/Target/X86')
-rw-r--r--  contrib/llvm/lib/Target/X86/AsmParser/CMakeLists.txt  7
-rw-r--r--  contrib/llvm/lib/Target/X86/AsmParser/Makefile  15
-rw-r--r--  contrib/llvm/lib/Target/X86/AsmParser/X86AsmLexer.cpp  147
-rw-r--r--  contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp  865
-rw-r--r--  contrib/llvm/lib/Target/X86/AsmPrinter/CMakeLists.txt  9
-rw-r--r--  contrib/llvm/lib/Target/X86/AsmPrinter/Makefile  15
-rw-r--r--  contrib/llvm/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.cpp  127
-rw-r--r--  contrib/llvm/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.h  85
-rw-r--r--  contrib/llvm/lib/Target/X86/AsmPrinter/X86AsmPrinter.cpp  670
-rw-r--r--  contrib/llvm/lib/Target/X86/AsmPrinter/X86AsmPrinter.h  89
-rw-r--r--  contrib/llvm/lib/Target/X86/AsmPrinter/X86IntelInstPrinter.cpp  138
-rw-r--r--  contrib/llvm/lib/Target/X86/AsmPrinter/X86IntelInstPrinter.h  100
-rw-r--r--  contrib/llvm/lib/Target/X86/AsmPrinter/X86MCInstLower.cpp  584
-rw-r--r--  contrib/llvm/lib/Target/X86/AsmPrinter/X86MCInstLower.h  51
-rw-r--r--  contrib/llvm/lib/Target/X86/CMakeLists.txt  52
-rw-r--r--  contrib/llvm/lib/Target/X86/Disassembler/CMakeLists.txt  14
-rw-r--r--  contrib/llvm/lib/Target/X86/Disassembler/Makefile  16
-rw-r--r--  contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp  566
-rw-r--r--  contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.h  155
-rw-r--r--  contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c  1388
-rw-r--r--  contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h  515
-rw-r--r--  contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h  355
-rw-r--r--  contrib/llvm/lib/Target/X86/Makefile  25
-rw-r--r--  contrib/llvm/lib/Target/X86/README-FPStack.txt  85
-rw-r--r--  contrib/llvm/lib/Target/X86/README-MMX.txt  71
-rw-r--r--  contrib/llvm/lib/Target/X86/README-SSE.txt  989
-rw-r--r--  contrib/llvm/lib/Target/X86/README-UNIMPLEMENTED.txt  14
-rw-r--r--  contrib/llvm/lib/Target/X86/README-X86-64.txt  300
-rw-r--r--  contrib/llvm/lib/Target/X86/README.txt  1931
-rw-r--r--  contrib/llvm/lib/Target/X86/SSEDomainFix.cpp  506
-rw-r--r--  contrib/llvm/lib/Target/X86/TargetInfo/CMakeLists.txt  7
-rw-r--r--  contrib/llvm/lib/Target/X86/TargetInfo/Makefile  16
-rw-r--r--  contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp  23
-rw-r--r--  contrib/llvm/lib/Target/X86/X86.h  91
-rw-r--r--  contrib/llvm/lib/Target/X86/X86.td  212
-rw-r--r--  contrib/llvm/lib/Target/X86/X86AsmBackend.cpp  304
-rw-r--r--  contrib/llvm/lib/Target/X86/X86COFF.h  95
-rw-r--r--  contrib/llvm/lib/Target/X86/X86COFFMachineModuleInfo.cpp  20
-rw-r--r--  contrib/llvm/lib/Target/X86/X86COFFMachineModuleInfo.h  46
-rw-r--r--  contrib/llvm/lib/Target/X86/X86CallingConv.td  358
-rw-r--r--  contrib/llvm/lib/Target/X86/X86CodeEmitter.cpp  887
-rw-r--r--  contrib/llvm/lib/Target/X86/X86CompilationCallback_Win64.asm  68
-rw-r--r--  contrib/llvm/lib/Target/X86/X86ELFWriterInfo.cpp  152
-rw-r--r--  contrib/llvm/lib/Target/X86/X86ELFWriterInfo.h  76
-rw-r--r--  contrib/llvm/lib/Target/X86/X86FastISel.cpp  1775
-rw-r--r--  contrib/llvm/lib/Target/X86/X86FixupKinds.h  26
-rw-r--r--  contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp  1208
-rw-r--r--  contrib/llvm/lib/Target/X86/X86FloatingPointRegKill.cpp  156
-rw-r--r--  contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp  2000
-rw-r--r--  contrib/llvm/lib/Target/X86/X86ISelLowering.cpp  10448
-rw-r--r--  contrib/llvm/lib/Target/X86/X86ISelLowering.h  821
-rw-r--r--  contrib/llvm/lib/Target/X86/X86Instr64bit.td  2376
-rw-r--r--  contrib/llvm/lib/Target/X86/X86InstrBuilder.h  172
-rw-r--r--  contrib/llvm/lib/Target/X86/X86InstrFPStack.td  698
-rw-r--r--  contrib/llvm/lib/Target/X86/X86InstrFormats.td  400
-rw-r--r--  contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td  62
-rw-r--r--  contrib/llvm/lib/Target/X86/X86InstrInfo.cpp  3786
-rw-r--r--  contrib/llvm/lib/Target/X86/X86InstrInfo.h  762
-rw-r--r--  contrib/llvm/lib/Target/X86/X86InstrInfo.td  4895
-rw-r--r--  contrib/llvm/lib/Target/X86/X86InstrMMX.td  677
-rw-r--r--  contrib/llvm/lib/Target/X86/X86InstrSSE.td  4142
-rw-r--r--  contrib/llvm/lib/Target/X86/X86JITInfo.cpp  566
-rw-r--r--  contrib/llvm/lib/Target/X86/X86JITInfo.h  81
-rw-r--r--  contrib/llvm/lib/Target/X86/X86MCAsmInfo.cpp  110
-rw-r--r--  contrib/llvm/lib/Target/X86/X86MCAsmInfo.h  38
-rw-r--r--  contrib/llvm/lib/Target/X86/X86MCCodeEmitter.cpp  659
-rw-r--r--  contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h  135
-rw-r--r--  contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp  1573
-rw-r--r--  contrib/llvm/lib/Target/X86/X86RegisterInfo.h  165
-rw-r--r--  contrib/llvm/lib/Target/X86/X86RegisterInfo.td  816
-rw-r--r--  contrib/llvm/lib/Target/X86/X86Relocations.h  52
-rw-r--r--  contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp  243
-rw-r--r--  contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h  59
-rw-r--r--  contrib/llvm/lib/Target/X86/X86Subtarget.cpp  374
-rw-r--r--  contrib/llvm/lib/Target/X86/X86Subtarget.h  244
-rw-r--r--  contrib/llvm/lib/Target/X86/X86TargetMachine.cpp  238
-rw-r--r--  contrib/llvm/lib/Target/X86/X86TargetMachine.h  97
-rw-r--r--  contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp  118
-rw-r--r--  contrib/llvm/lib/Target/X86/X86TargetObjectFile.h  54
79 files changed, 52265 insertions, 0 deletions
diff --git a/contrib/llvm/lib/Target/X86/AsmParser/CMakeLists.txt b/contrib/llvm/lib/Target/X86/AsmParser/CMakeLists.txt
new file mode 100644
index 0000000..40dbdd7
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/AsmParser/CMakeLists.txt
@@ -0,0 +1,7 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMX86AsmParser
+ X86AsmLexer.cpp
+ X86AsmParser.cpp
+ )
+add_dependencies(LLVMX86AsmParser X86CodeGenTable_gen)
diff --git a/contrib/llvm/lib/Target/X86/AsmParser/Makefile b/contrib/llvm/lib/Target/X86/AsmParser/Makefile
new file mode 100644
index 0000000..fb97607
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/AsmParser/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/X86/AsmParser/Makefile -------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMX86AsmParser
+
+# Hack: we need to include 'main' x86 target directory to grab private headers
+CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmLexer.cpp b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmLexer.cpp
new file mode 100644
index 0000000..a58f58e
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmLexer.cpp
@@ -0,0 +1,147 @@
+//===-- X86AsmLexer.cpp - Tokenize X86 assembly to AsmTokens --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Target/TargetAsmLexer.h"
+#include "llvm/Target/TargetRegistry.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "X86.h"
+
+using namespace llvm;
+
+namespace {
+
+class X86AsmLexer : public TargetAsmLexer {
+ const MCAsmInfo &AsmInfo;
+
+ bool tentativeIsValid;
+ AsmToken tentativeToken;
+
+ const AsmToken &lexTentative() {
+ tentativeToken = getLexer()->Lex();
+ tentativeIsValid = true;
+ return tentativeToken;
+ }
+
+ const AsmToken &lexDefinite() {
+ if(tentativeIsValid) {
+ tentativeIsValid = false;
+ return tentativeToken;
+ }
+ else {
+ return getLexer()->Lex();
+ }
+ }
+
+ AsmToken LexTokenATT();
+ AsmToken LexTokenIntel();
+protected:
+ AsmToken LexToken() {
+ if (!Lexer) {
+ SetError(SMLoc(), "No MCAsmLexer installed");
+ return AsmToken(AsmToken::Error, "", 0);
+ }
+
+ switch (AsmInfo.getAssemblerDialect()) {
+ default:
+ SetError(SMLoc(), "Unhandled dialect");
+ return AsmToken(AsmToken::Error, "", 0);
+ case 0:
+ return LexTokenATT();
+ case 1:
+ return LexTokenIntel();
+ }
+ }
+public:
+ X86AsmLexer(const Target &T, const MCAsmInfo &MAI)
+ : TargetAsmLexer(T), AsmInfo(MAI), tentativeIsValid(false) {
+ }
+};
+
+}
+
+static unsigned MatchRegisterName(StringRef Name);
+
+AsmToken X86AsmLexer::LexTokenATT() {
+ const AsmToken lexedToken = lexDefinite();
+
+ switch (lexedToken.getKind()) {
+ default:
+ return AsmToken(lexedToken);
+ case AsmToken::Error:
+ SetError(Lexer->getErrLoc(), Lexer->getErr());
+ return AsmToken(lexedToken);
+ case AsmToken::Percent:
+ {
+ const AsmToken &nextToken = lexTentative();
+ if (nextToken.getKind() == AsmToken::Identifier) {
+ unsigned regID = MatchRegisterName(nextToken.getString());
+
+ if (regID) {
+ lexDefinite();
+
+ StringRef regStr(lexedToken.getString().data(),
+ lexedToken.getString().size() +
+ nextToken.getString().size());
+
+ return AsmToken(AsmToken::Register,
+ regStr,
+ static_cast<int64_t>(regID));
+ }
+ else {
+ return AsmToken(lexedToken);
+ }
+ }
+ else {
+ return AsmToken(lexedToken);
+ }
+ }
+ }
+}
+
+AsmToken X86AsmLexer::LexTokenIntel() {
+ const AsmToken &lexedToken = lexDefinite();
+
+ switch(lexedToken.getKind()) {
+ default:
+ return AsmToken(lexedToken);
+ case AsmToken::Error:
+ SetError(Lexer->getErrLoc(), Lexer->getErr());
+ return AsmToken(lexedToken);
+ case AsmToken::Identifier:
+ {
+ std::string upperCase = lexedToken.getString().str();
+ std::string lowerCase = LowercaseString(upperCase);
+ StringRef lowerRef(lowerCase);
+
+ unsigned regID = MatchRegisterName(lowerRef);
+
+ if (regID) {
+ return AsmToken(AsmToken::Register,
+ lexedToken.getString(),
+ static_cast<int64_t>(regID));
+ }
+ else {
+ return AsmToken(lexedToken);
+ }
+ }
+ }
+}
+
+extern "C" void LLVMInitializeX86AsmLexer() {
+ RegisterAsmLexer<X86AsmLexer> X(TheX86_32Target);
+ RegisterAsmLexer<X86AsmLexer> Y(TheX86_64Target);
+}
+
+#define REGISTERS_ONLY
+#include "X86GenAsmMatcher.inc"
+#undef REGISTERS_ONLY
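For illustration, a minimal standalone sketch of the one-token lookahead used above: lexTentative() lexes a token but remembers it, and lexDefinite() hands back the remembered token before lexing anything new. LexTokenATT relies on this to peek at the identifier following '%' and fuse the pair into a single Register token only when the identifier names a register. The class and token type here are hypothetical stand-ins, not the LLVM MCAsmLexer API.

#include <string>
#include <utility>
#include <vector>

// Hypothetical stand-in lexer; only the tentative/definite lookahead mirrors
// the X86AsmLexer logic above.
class PeekableLexer {
  std::vector<std::string> Tokens;   // pretend token stream
  size_t Pos = 0;
  bool TentativeIsValid = false;
  std::string Tentative;

public:
  explicit PeekableLexer(std::vector<std::string> T) : Tokens(std::move(T)) {}

  // Peek: lex one token but cache it instead of surrendering it.
  const std::string &lexTentative() {
    Tentative = Pos < Tokens.size() ? Tokens[Pos++] : std::string();
    TentativeIsValid = true;
    return Tentative;
  }

  // Consume: return the cached token if present, otherwise lex a fresh one.
  std::string lexDefinite() {
    if (TentativeIsValid) {
      TentativeIsValid = false;
      return Tentative;
    }
    return Pos < Tokens.size() ? Tokens[Pos++] : std::string();
  }
};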
diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
new file mode 100644
index 0000000..40a6a7b
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -0,0 +1,865 @@
+//===-- X86AsmParser.cpp - Parse X86 assembly to MCInst instructions ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetAsmParser.h"
+#include "X86.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Target/TargetAsmParser.h"
+using namespace llvm;
+
+namespace {
+struct X86Operand;
+
+class X86ATTAsmParser : public TargetAsmParser {
+ MCAsmParser &Parser;
+
+protected:
+ unsigned Is64Bit : 1;
+
+private:
+ MCAsmParser &getParser() const { return Parser; }
+
+ MCAsmLexer &getLexer() const { return Parser.getLexer(); }
+
+ void Warning(SMLoc L, const Twine &Msg) { Parser.Warning(L, Msg); }
+
+ bool Error(SMLoc L, const Twine &Msg) { return Parser.Error(L, Msg); }
+
+ bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc);
+
+ X86Operand *ParseOperand();
+ X86Operand *ParseMemOperand(unsigned SegReg, SMLoc StartLoc);
+
+ bool ParseDirectiveWord(unsigned Size, SMLoc L);
+
+ void InstructionCleanup(MCInst &Inst);
+
+ /// @name Auto-generated Match Functions
+ /// {
+
+ bool MatchInstruction(const SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+ MCInst &Inst);
+
+ bool MatchInstructionImpl(
+ const SmallVectorImpl<MCParsedAsmOperand*> &Operands, MCInst &Inst);
+
+ /// }
+
+public:
+ X86ATTAsmParser(const Target &T, MCAsmParser &_Parser)
+ : TargetAsmParser(T), Parser(_Parser) {}
+
+ virtual bool ParseInstruction(const StringRef &Name, SMLoc NameLoc,
+ SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+
+ virtual bool ParseDirective(AsmToken DirectiveID);
+};
+
+class X86_32ATTAsmParser : public X86ATTAsmParser {
+public:
+ X86_32ATTAsmParser(const Target &T, MCAsmParser &_Parser)
+ : X86ATTAsmParser(T, _Parser) {
+ Is64Bit = false;
+ }
+};
+
+class X86_64ATTAsmParser : public X86ATTAsmParser {
+public:
+ X86_64ATTAsmParser(const Target &T, MCAsmParser &_Parser)
+ : X86ATTAsmParser(T, _Parser) {
+ Is64Bit = true;
+ }
+};
+
+} // end anonymous namespace
+
+/// @name Auto-generated Match Functions
+/// {
+
+static unsigned MatchRegisterName(StringRef Name);
+
+/// }
+
+namespace {
+
+/// X86Operand - Instances of this class represent a parsed X86 machine
+/// instruction.
+struct X86Operand : public MCParsedAsmOperand {
+ enum KindTy {
+ Token,
+ Register,
+ Immediate,
+ Memory
+ } Kind;
+
+ SMLoc StartLoc, EndLoc;
+
+ union {
+ struct {
+ const char *Data;
+ unsigned Length;
+ } Tok;
+
+ struct {
+ unsigned RegNo;
+ } Reg;
+
+ struct {
+ const MCExpr *Val;
+ } Imm;
+
+ struct {
+ unsigned SegReg;
+ const MCExpr *Disp;
+ unsigned BaseReg;
+ unsigned IndexReg;
+ unsigned Scale;
+ } Mem;
+ };
+
+ X86Operand(KindTy K, SMLoc Start, SMLoc End)
+ : Kind(K), StartLoc(Start), EndLoc(End) {}
+
+ /// getStartLoc - Get the location of the first token of this operand.
+ SMLoc getStartLoc() const { return StartLoc; }
+ /// getEndLoc - Get the location of the last token of this operand.
+ SMLoc getEndLoc() const { return EndLoc; }
+
+ StringRef getToken() const {
+ assert(Kind == Token && "Invalid access!");
+ return StringRef(Tok.Data, Tok.Length);
+ }
+ void setTokenValue(StringRef Value) {
+ assert(Kind == Token && "Invalid access!");
+ Tok.Data = Value.data();
+ Tok.Length = Value.size();
+ }
+
+ unsigned getReg() const {
+ assert(Kind == Register && "Invalid access!");
+ return Reg.RegNo;
+ }
+
+ const MCExpr *getImm() const {
+ assert(Kind == Immediate && "Invalid access!");
+ return Imm.Val;
+ }
+
+ const MCExpr *getMemDisp() const {
+ assert(Kind == Memory && "Invalid access!");
+ return Mem.Disp;
+ }
+ unsigned getMemSegReg() const {
+ assert(Kind == Memory && "Invalid access!");
+ return Mem.SegReg;
+ }
+ unsigned getMemBaseReg() const {
+ assert(Kind == Memory && "Invalid access!");
+ return Mem.BaseReg;
+ }
+ unsigned getMemIndexReg() const {
+ assert(Kind == Memory && "Invalid access!");
+ return Mem.IndexReg;
+ }
+ unsigned getMemScale() const {
+ assert(Kind == Memory && "Invalid access!");
+ return Mem.Scale;
+ }
+
+ bool isToken() const {return Kind == Token; }
+
+ bool isImm() const { return Kind == Immediate; }
+
+ bool isImmSExti16i8() const {
+ if (!isImm())
+ return false;
+
+ // If this isn't a constant expr, just assume it fits and let relaxation
+ // handle it.
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE)
+ return true;
+
+ // Otherwise, check the value is in a range that makes sense for this
+ // extension.
+ uint64_t Value = CE->getValue();
+ return (( Value <= 0x000000000000007FULL)||
+ (0x000000000000FF80ULL <= Value && Value <= 0x000000000000FFFFULL)||
+ (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
+ }
+ bool isImmSExti32i8() const {
+ if (!isImm())
+ return false;
+
+ // If this isn't a constant expr, just assume it fits and let relaxation
+ // handle it.
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE)
+ return true;
+
+ // Otherwise, check the value is in a range that makes sense for this
+ // extension.
+ uint64_t Value = CE->getValue();
+ return (( Value <= 0x000000000000007FULL)||
+ (0x00000000FFFFFF80ULL <= Value && Value <= 0x00000000FFFFFFFFULL)||
+ (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
+ }
+ bool isImmSExti64i8() const {
+ if (!isImm())
+ return false;
+
+ // If this isn't a constant expr, just assume it fits and let relaxation
+ // handle it.
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE)
+ return true;
+
+ // Otherwise, check the value is in a range that makes sense for this
+ // extension.
+ uint64_t Value = CE->getValue();
+ return (( Value <= 0x000000000000007FULL)||
+ (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
+ }
+ bool isImmSExti64i32() const {
+ if (!isImm())
+ return false;
+
+ // If this isn't a constant expr, just assume it fits and let relaxation
+ // handle it.
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE)
+ return true;
+
+ // Otherwise, check the value is in a range that makes sense for this
+ // extension.
+ uint64_t Value = CE->getValue();
+ return (( Value <= 0x000000007FFFFFFFULL)||
+ (0xFFFFFFFF80000000ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
+ }
+
+ bool isMem() const { return Kind == Memory; }
+
+ bool isAbsMem() const {
+ return Kind == Memory && !getMemSegReg() && !getMemBaseReg() &&
+ !getMemIndexReg() && getMemScale() == 1;
+ }
+
+ bool isNoSegMem() const {
+ return Kind == Memory && !getMemSegReg();
+ }
+
+ bool isReg() const { return Kind == Register; }
+
+ void addExpr(MCInst &Inst, const MCExpr *Expr) const {
+ // Add as immediates when possible.
+ if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
+ Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
+ else
+ Inst.addOperand(MCOperand::CreateExpr(Expr));
+ }
+
+ void addRegOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(getReg()));
+ }
+
+ void addImmOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ addExpr(Inst, getImm());
+ }
+
+ void addMemOperands(MCInst &Inst, unsigned N) const {
+ assert((N == 5) && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(getMemBaseReg()));
+ Inst.addOperand(MCOperand::CreateImm(getMemScale()));
+ Inst.addOperand(MCOperand::CreateReg(getMemIndexReg()));
+ addExpr(Inst, getMemDisp());
+ Inst.addOperand(MCOperand::CreateReg(getMemSegReg()));
+ }
+
+ void addAbsMemOperands(MCInst &Inst, unsigned N) const {
+ assert((N == 1) && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateExpr(getMemDisp()));
+ }
+
+ void addNoSegMemOperands(MCInst &Inst, unsigned N) const {
+ assert((N == 4) && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(getMemBaseReg()));
+ Inst.addOperand(MCOperand::CreateImm(getMemScale()));
+ Inst.addOperand(MCOperand::CreateReg(getMemIndexReg()));
+ addExpr(Inst, getMemDisp());
+ }
+
+ static X86Operand *CreateToken(StringRef Str, SMLoc Loc) {
+ X86Operand *Res = new X86Operand(Token, Loc, Loc);
+ Res->Tok.Data = Str.data();
+ Res->Tok.Length = Str.size();
+ return Res;
+ }
+
+ static X86Operand *CreateReg(unsigned RegNo, SMLoc StartLoc, SMLoc EndLoc) {
+ X86Operand *Res = new X86Operand(Register, StartLoc, EndLoc);
+ Res->Reg.RegNo = RegNo;
+ return Res;
+ }
+
+ static X86Operand *CreateImm(const MCExpr *Val, SMLoc StartLoc, SMLoc EndLoc){
+ X86Operand *Res = new X86Operand(Immediate, StartLoc, EndLoc);
+ Res->Imm.Val = Val;
+ return Res;
+ }
+
+ /// Create an absolute memory operand.
+ static X86Operand *CreateMem(const MCExpr *Disp, SMLoc StartLoc,
+ SMLoc EndLoc) {
+ X86Operand *Res = new X86Operand(Memory, StartLoc, EndLoc);
+ Res->Mem.SegReg = 0;
+ Res->Mem.Disp = Disp;
+ Res->Mem.BaseReg = 0;
+ Res->Mem.IndexReg = 0;
+ Res->Mem.Scale = 1;
+ return Res;
+ }
+
+ /// Create a generalized memory operand.
+ static X86Operand *CreateMem(unsigned SegReg, const MCExpr *Disp,
+ unsigned BaseReg, unsigned IndexReg,
+ unsigned Scale, SMLoc StartLoc, SMLoc EndLoc) {
+    // We should never have just a displacement; that should be parsed as an
+ // absolute memory operand.
+ assert((SegReg || BaseReg || IndexReg) && "Invalid memory operand!");
+
+ // The scale should always be one of {1,2,4,8}.
+ assert(((Scale == 1 || Scale == 2 || Scale == 4 || Scale == 8)) &&
+ "Invalid scale!");
+ X86Operand *Res = new X86Operand(Memory, StartLoc, EndLoc);
+ Res->Mem.SegReg = SegReg;
+ Res->Mem.Disp = Disp;
+ Res->Mem.BaseReg = BaseReg;
+ Res->Mem.IndexReg = IndexReg;
+ Res->Mem.Scale = Scale;
+ return Res;
+ }
+};
+
+} // end anonymous namespace.
+
+
+bool X86ATTAsmParser::ParseRegister(unsigned &RegNo,
+ SMLoc &StartLoc, SMLoc &EndLoc) {
+ RegNo = 0;
+ const AsmToken &TokPercent = Parser.getTok();
+ assert(TokPercent.is(AsmToken::Percent) && "Invalid token kind!");
+ StartLoc = TokPercent.getLoc();
+ Parser.Lex(); // Eat percent token.
+
+ const AsmToken &Tok = Parser.getTok();
+ if (Tok.isNot(AsmToken::Identifier))
+ return Error(Tok.getLoc(), "invalid register name");
+
+ // FIXME: Validate register for the current architecture; we have to do
+ // validation later, so maybe there is no need for this here.
+ RegNo = MatchRegisterName(Tok.getString());
+
+ // Parse %st(1) and "%st" as "%st(0)"
+ if (RegNo == 0 && Tok.getString() == "st") {
+ RegNo = X86::ST0;
+ EndLoc = Tok.getLoc();
+ Parser.Lex(); // Eat 'st'
+
+ // Check to see if we have '(4)' after %st.
+ if (getLexer().isNot(AsmToken::LParen))
+ return false;
+ // Lex the paren.
+ getParser().Lex();
+
+ const AsmToken &IntTok = Parser.getTok();
+ if (IntTok.isNot(AsmToken::Integer))
+ return Error(IntTok.getLoc(), "expected stack index");
+ switch (IntTok.getIntVal()) {
+ case 0: RegNo = X86::ST0; break;
+ case 1: RegNo = X86::ST1; break;
+ case 2: RegNo = X86::ST2; break;
+ case 3: RegNo = X86::ST3; break;
+ case 4: RegNo = X86::ST4; break;
+ case 5: RegNo = X86::ST5; break;
+ case 6: RegNo = X86::ST6; break;
+ case 7: RegNo = X86::ST7; break;
+ default: return Error(IntTok.getLoc(), "invalid stack index");
+ }
+
+ if (getParser().Lex().isNot(AsmToken::RParen))
+ return Error(Parser.getTok().getLoc(), "expected ')'");
+
+ EndLoc = Tok.getLoc();
+ Parser.Lex(); // Eat ')'
+ return false;
+ }
+
+ if (RegNo == 0)
+ return Error(Tok.getLoc(), "invalid register name");
+
+ EndLoc = Tok.getLoc();
+ Parser.Lex(); // Eat identifier token.
+ return false;
+}
+
+X86Operand *X86ATTAsmParser::ParseOperand() {
+ switch (getLexer().getKind()) {
+ default:
+ // Parse a memory operand with no segment register.
+ return ParseMemOperand(0, Parser.getTok().getLoc());
+ case AsmToken::Percent: {
+ // Read the register.
+ unsigned RegNo;
+ SMLoc Start, End;
+ if (ParseRegister(RegNo, Start, End)) return 0;
+
+ // If this is a segment register followed by a ':', then this is the start
+ // of a memory reference, otherwise this is a normal register reference.
+ if (getLexer().isNot(AsmToken::Colon))
+ return X86Operand::CreateReg(RegNo, Start, End);
+
+
+ getParser().Lex(); // Eat the colon.
+ return ParseMemOperand(RegNo, Start);
+ }
+ case AsmToken::Dollar: {
+ // $42 -> immediate.
+ SMLoc Start = Parser.getTok().getLoc(), End;
+ Parser.Lex();
+ const MCExpr *Val;
+ if (getParser().ParseExpression(Val, End))
+ return 0;
+ return X86Operand::CreateImm(Val, Start, End);
+ }
+ }
+}
+
+/// ParseMemOperand: segment: disp(basereg, indexreg, scale). The '%ds:' prefix
+/// has already been parsed if present.
+X86Operand *X86ATTAsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
+
+ // We have to disambiguate a parenthesized expression "(4+5)" from the start
+ // of a memory operand with a missing displacement "(%ebx)" or "(,%eax)". The
+ // only way to do this without lookahead is to eat the '(' and see what is
+ // after it.
+ const MCExpr *Disp = MCConstantExpr::Create(0, getParser().getContext());
+ if (getLexer().isNot(AsmToken::LParen)) {
+ SMLoc ExprEnd;
+ if (getParser().ParseExpression(Disp, ExprEnd)) return 0;
+
+ // After parsing the base expression we could either have a parenthesized
+ // memory address or not. If not, return now. If so, eat the (.
+ if (getLexer().isNot(AsmToken::LParen)) {
+ // Unless we have a segment register, treat this as an immediate.
+ if (SegReg == 0)
+ return X86Operand::CreateMem(Disp, MemStart, ExprEnd);
+ return X86Operand::CreateMem(SegReg, Disp, 0, 0, 1, MemStart, ExprEnd);
+ }
+
+ // Eat the '('.
+ Parser.Lex();
+ } else {
+    // Okay, we have a '('. We don't know if this is an expression or not, so
+    // we have to eat the '(' to see beyond it.
+ SMLoc LParenLoc = Parser.getTok().getLoc();
+ Parser.Lex(); // Eat the '('.
+
+ if (getLexer().is(AsmToken::Percent) || getLexer().is(AsmToken::Comma)) {
+ // Nothing to do here, fall into the code below with the '(' part of the
+ // memory operand consumed.
+ } else {
+ SMLoc ExprEnd;
+
+      // It must be a parenthesized expression, parse it now.
+ if (getParser().ParseParenExpression(Disp, ExprEnd))
+ return 0;
+
+ // After parsing the base expression we could either have a parenthesized
+ // memory address or not. If not, return now. If so, eat the (.
+ if (getLexer().isNot(AsmToken::LParen)) {
+ // Unless we have a segment register, treat this as an immediate.
+ if (SegReg == 0)
+ return X86Operand::CreateMem(Disp, LParenLoc, ExprEnd);
+ return X86Operand::CreateMem(SegReg, Disp, 0, 0, 1, MemStart, ExprEnd);
+ }
+
+ // Eat the '('.
+ Parser.Lex();
+ }
+ }
+
+ // If we reached here, then we just ate the ( of the memory operand. Process
+ // the rest of the memory operand.
+ unsigned BaseReg = 0, IndexReg = 0, Scale = 1;
+
+ if (getLexer().is(AsmToken::Percent)) {
+ SMLoc L;
+ if (ParseRegister(BaseReg, L, L)) return 0;
+ }
+
+ if (getLexer().is(AsmToken::Comma)) {
+ Parser.Lex(); // Eat the comma.
+
+ // Following the comma we should have either an index register, or a scale
+    // value. We don't support the latter form, but we want to parse it
+ // correctly.
+ //
+    // Note that even though it would be completely consistent to support syntax
+ // like "1(%eax,,1)", the assembler doesn't.
+ if (getLexer().is(AsmToken::Percent)) {
+ SMLoc L;
+ if (ParseRegister(IndexReg, L, L)) return 0;
+
+ if (getLexer().isNot(AsmToken::RParen)) {
+ // Parse the scale amount:
+ // ::= ',' [scale-expression]
+ if (getLexer().isNot(AsmToken::Comma)) {
+ Error(Parser.getTok().getLoc(),
+ "expected comma in scale expression");
+ return 0;
+ }
+ Parser.Lex(); // Eat the comma.
+
+ if (getLexer().isNot(AsmToken::RParen)) {
+ SMLoc Loc = Parser.getTok().getLoc();
+
+ int64_t ScaleVal;
+ if (getParser().ParseAbsoluteExpression(ScaleVal))
+ return 0;
+
+ // Validate the scale amount.
+ if (ScaleVal != 1 && ScaleVal != 2 && ScaleVal != 4 && ScaleVal != 8){
+ Error(Loc, "scale factor in address must be 1, 2, 4 or 8");
+ return 0;
+ }
+ Scale = (unsigned)ScaleVal;
+ }
+ }
+ } else if (getLexer().isNot(AsmToken::RParen)) {
+ // Otherwise we have the unsupported form of a scale amount without an
+ // index.
+ SMLoc Loc = Parser.getTok().getLoc();
+
+ int64_t Value;
+ if (getParser().ParseAbsoluteExpression(Value))
+ return 0;
+
+ Error(Loc, "cannot have scale factor without index register");
+ return 0;
+ }
+ }
+
+ // Ok, we've eaten the memory operand, verify we have a ')' and eat it too.
+ if (getLexer().isNot(AsmToken::RParen)) {
+ Error(Parser.getTok().getLoc(), "unexpected token in memory operand");
+ return 0;
+ }
+ SMLoc MemEnd = Parser.getTok().getLoc();
+ Parser.Lex(); // Eat the ')'.
+
+ return X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale,
+ MemStart, MemEnd);
+}
+
+bool X86ATTAsmParser::
+ParseInstruction(const StringRef &Name, SMLoc NameLoc,
+ SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ // The various flavors of pushf and popf use Requires<In32BitMode> and
+ // Requires<In64BitMode>, but the assembler doesn't yet implement that.
+ // For now, just do a manual check to prevent silent misencoding.
+ if (Is64Bit) {
+ if (Name == "popfl")
+ return Error(NameLoc, "popfl cannot be encoded in 64-bit mode");
+ else if (Name == "pushfl")
+ return Error(NameLoc, "pushfl cannot be encoded in 64-bit mode");
+ } else {
+ if (Name == "popfq")
+ return Error(NameLoc, "popfq cannot be encoded in 32-bit mode");
+ else if (Name == "pushfq")
+ return Error(NameLoc, "pushfq cannot be encoded in 32-bit mode");
+ }
+
+ // FIXME: Hack to recognize "sal..." and "rep..." for now. We need a way to
+ // represent alternative syntaxes in the .td file, without requiring
+ // instruction duplication.
+ StringRef PatchedName = StringSwitch<StringRef>(Name)
+ .Case("sal", "shl")
+ .Case("salb", "shlb")
+ .Case("sall", "shll")
+ .Case("salq", "shlq")
+ .Case("salw", "shlw")
+ .Case("repe", "rep")
+ .Case("repz", "rep")
+ .Case("repnz", "repne")
+ .Case("pushf", Is64Bit ? "pushfq" : "pushfl")
+ .Case("popf", Is64Bit ? "popfq" : "popfl")
+ .Case("retl", Is64Bit ? "retl" : "ret")
+ .Case("retq", Is64Bit ? "ret" : "retq")
+ .Case("setz", "sete")
+ .Case("setnz", "setne")
+ .Case("jz", "je")
+ .Case("jnz", "jne")
+ .Case("cmovcl", "cmovbl")
+ .Case("cmovcl", "cmovbl")
+ .Case("cmovnal", "cmovbel")
+ .Case("cmovnbl", "cmovael")
+ .Case("cmovnbel", "cmoval")
+ .Case("cmovncl", "cmovael")
+ .Case("cmovngl", "cmovlel")
+ .Case("cmovnl", "cmovgel")
+ .Case("cmovngl", "cmovlel")
+ .Case("cmovngel", "cmovll")
+ .Case("cmovnll", "cmovgel")
+ .Case("cmovnlel", "cmovgl")
+ .Case("cmovnzl", "cmovnel")
+ .Case("cmovzl", "cmovel")
+ .Default(Name);
+
+ // FIXME: Hack to recognize cmp<comparison code>{ss,sd,ps,pd}.
+ const MCExpr *ExtraImmOp = 0;
+ if (PatchedName.startswith("cmp") &&
+ (PatchedName.endswith("ss") || PatchedName.endswith("sd") ||
+ PatchedName.endswith("ps") || PatchedName.endswith("pd"))) {
+ unsigned SSEComparisonCode = StringSwitch<unsigned>(
+ PatchedName.slice(3, PatchedName.size() - 2))
+ .Case("eq", 0)
+ .Case("lt", 1)
+ .Case("le", 2)
+ .Case("unord", 3)
+ .Case("neq", 4)
+ .Case("nlt", 5)
+ .Case("nle", 6)
+ .Case("ord", 7)
+ .Default(~0U);
+ if (SSEComparisonCode != ~0U) {
+ ExtraImmOp = MCConstantExpr::Create(SSEComparisonCode,
+ getParser().getContext());
+ if (PatchedName.endswith("ss")) {
+ PatchedName = "cmpss";
+ } else if (PatchedName.endswith("sd")) {
+ PatchedName = "cmpsd";
+ } else if (PatchedName.endswith("ps")) {
+ PatchedName = "cmpps";
+ } else {
+ assert(PatchedName.endswith("pd") && "Unexpected mnemonic!");
+ PatchedName = "cmppd";
+ }
+ }
+ }
+ Operands.push_back(X86Operand::CreateToken(PatchedName, NameLoc));
+
+ if (ExtraImmOp)
+ Operands.push_back(X86Operand::CreateImm(ExtraImmOp, NameLoc, NameLoc));
+
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+
+ // Parse '*' modifier.
+ if (getLexer().is(AsmToken::Star)) {
+ SMLoc Loc = Parser.getTok().getLoc();
+ Operands.push_back(X86Operand::CreateToken("*", Loc));
+ Parser.Lex(); // Eat the star.
+ }
+
+ // Read the first operand.
+ if (X86Operand *Op = ParseOperand())
+ Operands.push_back(Op);
+ else
+ return true;
+
+ while (getLexer().is(AsmToken::Comma)) {
+ Parser.Lex(); // Eat the comma.
+
+ // Parse and remember the operand.
+ if (X86Operand *Op = ParseOperand())
+ Operands.push_back(Op);
+ else
+ return true;
+ }
+ }
+
+ // FIXME: Hack to handle recognizing s{hr,ar,hl}? $1.
+ if ((Name.startswith("shr") || Name.startswith("sar") ||
+ Name.startswith("shl")) &&
+ Operands.size() == 3 &&
+ static_cast<X86Operand*>(Operands[1])->isImm() &&
+ isa<MCConstantExpr>(static_cast<X86Operand*>(Operands[1])->getImm()) &&
+ cast<MCConstantExpr>(static_cast<X86Operand*>(Operands[1])->getImm())->getValue() == 1) {
+ delete Operands[1];
+ Operands.erase(Operands.begin() + 1);
+ }
+
+ // FIXME: Hack to handle "f{mul*,add*,sub*,div*} $op, st(0)" the same as
+ // "f{mul*,add*,sub*,div*} $op"
+ if ((Name.startswith("fmul") || Name.startswith("fadd") ||
+ Name.startswith("fsub") || Name.startswith("fdiv")) &&
+ Operands.size() == 3 &&
+ static_cast<X86Operand*>(Operands[2])->isReg() &&
+ static_cast<X86Operand*>(Operands[2])->getReg() == X86::ST0) {
+ delete Operands[2];
+ Operands.erase(Operands.begin() + 2);
+ }
+
+ return false;
+}
+
+bool X86ATTAsmParser::ParseDirective(AsmToken DirectiveID) {
+ StringRef IDVal = DirectiveID.getIdentifier();
+ if (IDVal == ".word")
+ return ParseDirectiveWord(2, DirectiveID.getLoc());
+ return true;
+}
+
+/// ParseDirectiveWord
+/// ::= .word [ expression (, expression)* ]
+bool X86ATTAsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ for (;;) {
+ const MCExpr *Value;
+ if (getParser().ParseExpression(Value))
+ return true;
+
+ getParser().getStreamer().EmitValue(Value, Size, 0 /*addrspace*/);
+
+ if (getLexer().is(AsmToken::EndOfStatement))
+ break;
+
+ // FIXME: Improve diagnostic.
+ if (getLexer().isNot(AsmToken::Comma))
+ return Error(L, "unexpected token in directive");
+ Parser.Lex();
+ }
+ }
+
+ Parser.Lex();
+ return false;
+}
+
+/// LowerMOffset - Lower an 'moffset' form of an instruction, which just has an
+/// imm operand, to having "rm" or "mr" operands with the offset in the disp
+/// field.
+static void LowerMOffset(MCInst &Inst, unsigned Opc, unsigned RegNo,
+ bool isMR) {
+ MCOperand Disp = Inst.getOperand(0);
+
+ // Start over with an empty instruction.
+ Inst = MCInst();
+ Inst.setOpcode(Opc);
+
+ if (!isMR)
+ Inst.addOperand(MCOperand::CreateReg(RegNo));
+
+ // Add the mem operand.
+ Inst.addOperand(MCOperand::CreateReg(0)); // Segment
+ Inst.addOperand(MCOperand::CreateImm(1)); // Scale
+ Inst.addOperand(MCOperand::CreateReg(0)); // IndexReg
+ Inst.addOperand(Disp); // Displacement
+ Inst.addOperand(MCOperand::CreateReg(0)); // BaseReg
+
+ if (isMR)
+ Inst.addOperand(MCOperand::CreateReg(RegNo));
+}
+
+// FIXME: Custom X86 cleanup function to implement a temporary hack to handle
+// matching INCL/DECL correctly for x86_64. This needs to be replaced by a
+// proper mechanism for supporting (ambiguous) feature dependent instructions.
+void X86ATTAsmParser::InstructionCleanup(MCInst &Inst) {
+ if (!Is64Bit) return;
+
+ switch (Inst.getOpcode()) {
+ case X86::DEC16r: Inst.setOpcode(X86::DEC64_16r); break;
+ case X86::DEC16m: Inst.setOpcode(X86::DEC64_16m); break;
+ case X86::DEC32r: Inst.setOpcode(X86::DEC64_32r); break;
+ case X86::DEC32m: Inst.setOpcode(X86::DEC64_32m); break;
+ case X86::INC16r: Inst.setOpcode(X86::INC64_16r); break;
+ case X86::INC16m: Inst.setOpcode(X86::INC64_16m); break;
+ case X86::INC32r: Inst.setOpcode(X86::INC64_32r); break;
+ case X86::INC32m: Inst.setOpcode(X86::INC64_32m); break;
+
+ // moffset instructions are x86-32 only.
+ case X86::MOV8o8a: LowerMOffset(Inst, X86::MOV8rm , X86::AL , false); break;
+ case X86::MOV16o16a: LowerMOffset(Inst, X86::MOV16rm, X86::AX , false); break;
+ case X86::MOV32o32a: LowerMOffset(Inst, X86::MOV32rm, X86::EAX, false); break;
+ case X86::MOV8ao8: LowerMOffset(Inst, X86::MOV8mr , X86::AL , true); break;
+ case X86::MOV16ao16: LowerMOffset(Inst, X86::MOV16mr, X86::AX , true); break;
+ case X86::MOV32ao32: LowerMOffset(Inst, X86::MOV32mr, X86::EAX, true); break;
+ }
+}
+
+bool
+X86ATTAsmParser::MatchInstruction(const SmallVectorImpl<MCParsedAsmOperand*>
+ &Operands,
+ MCInst &Inst) {
+ // First, try a direct match.
+ if (!MatchInstructionImpl(Operands, Inst))
+ return false;
+
+ // Ignore anything which is obviously not a suffix match.
+ if (Operands.size() == 0)
+ return true;
+ X86Operand *Op = static_cast<X86Operand*>(Operands[0]);
+ if (!Op->isToken() || Op->getToken().size() > 15)
+ return true;
+
+ // FIXME: Ideally, we would only attempt suffix matches for things which are
+ // valid prefixes, and we could just infer the right unambiguous
+ // type. However, that requires substantially more matcher support than the
+ // following hack.
+
+ // Change the operand to point to a temporary token.
+ char Tmp[16];
+ StringRef Base = Op->getToken();
+ memcpy(Tmp, Base.data(), Base.size());
+ Op->setTokenValue(StringRef(Tmp, Base.size() + 1));
+
+ // Check for the various suffix matches.
+ Tmp[Base.size()] = 'b';
+ bool MatchB = MatchInstructionImpl(Operands, Inst);
+ Tmp[Base.size()] = 'w';
+ bool MatchW = MatchInstructionImpl(Operands, Inst);
+ Tmp[Base.size()] = 'l';
+ bool MatchL = MatchInstructionImpl(Operands, Inst);
+ Tmp[Base.size()] = 'q';
+ bool MatchQ = MatchInstructionImpl(Operands, Inst);
+
+ // Restore the old token.
+ Op->setTokenValue(Base);
+
+ // If exactly one matched, then we treat that as a successful match (and the
+ // instruction will already have been filled in correctly, since the failing
+ // matches won't have modified it).
+ if (MatchB + MatchW + MatchL + MatchQ == 3)
+ return false;
+
+ // Otherwise, the match failed.
+ return true;
+}
+
+
+extern "C" void LLVMInitializeX86AsmLexer();
+
+// Force static initialization.
+extern "C" void LLVMInitializeX86AsmParser() {
+ RegisterAsmParser<X86_32ATTAsmParser> X(TheX86_32Target);
+ RegisterAsmParser<X86_64ATTAsmParser> Y(TheX86_64Target);
+ LLVMInitializeX86AsmLexer();
+}
+
+#include "X86GenAsmMatcher.inc"
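As a standalone sanity check on the immediate-range predicates above (not part of the patch itself), the function below copies the three unsigned ranges from isImmSExti32i8() verbatim and exercises them on boundary values: 127 and -128 are representable as an imm8 that sign-extends to imm32, while 128 is not.

#include <cassert>
#include <cstdint>

// Same three ranges as X86Operand::isImmSExti32i8() in the parser above.
static bool fitsSExti32i8(uint64_t Value) {
  return (Value <= 0x000000000000007FULL) ||
         (0x00000000FFFFFF80ULL <= Value && Value <= 0x00000000FFFFFFFFULL) ||
         (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL);
}

int main() {
  assert(fitsSExti32i8(0x7FULL));                // 127, largest positive imm8
  assert(fitsSExti32i8(0x00000000FFFFFF80ULL));  // -128 as a 32-bit immediate
  assert(fitsSExti32i8(0xFFFFFFFFFFFFFF80ULL));  // -128 as a 64-bit immediate
  assert(!fitsSExti32i8(0x80ULL));               // 128 needs a full imm32
  return 0;
}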
diff --git a/contrib/llvm/lib/Target/X86/AsmPrinter/CMakeLists.txt b/contrib/llvm/lib/Target/X86/AsmPrinter/CMakeLists.txt
new file mode 100644
index 0000000..b70a587
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/AsmPrinter/CMakeLists.txt
@@ -0,0 +1,9 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMX86AsmPrinter
+ X86ATTInstPrinter.cpp
+ X86AsmPrinter.cpp
+ X86IntelInstPrinter.cpp
+ X86MCInstLower.cpp
+ )
+add_dependencies(LLVMX86AsmPrinter X86CodeGenTable_gen)
diff --git a/contrib/llvm/lib/Target/X86/AsmPrinter/Makefile b/contrib/llvm/lib/Target/X86/AsmPrinter/Makefile
new file mode 100644
index 0000000..c82aa33
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/AsmPrinter/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/X86/AsmPrinter/Makefile ------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMX86AsmPrinter
+
+# Hack: we need to include 'main' x86 target directory to grab private headers
+CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/contrib/llvm/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.cpp b/contrib/llvm/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.cpp
new file mode 100644
index 0000000..0b64cb4
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.cpp
@@ -0,0 +1,127 @@
+//===-- X86ATTInstPrinter.cpp - AT&T assembly instruction printing --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file includes code for rendering MCInst instances as AT&T-style
+// assembly.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "X86ATTInstPrinter.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/FormattedStream.h"
+#include "X86GenInstrNames.inc"
+using namespace llvm;
+
+// Include the auto-generated portion of the assembly writer.
+#define MachineInstr MCInst
+#define GET_INSTRUCTION_NAME
+#include "X86GenAsmWriter.inc"
+#undef MachineInstr
+
+void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS) {
+ printInstruction(MI, OS);
+}
+StringRef X86ATTInstPrinter::getOpcodeName(unsigned Opcode) const {
+ return getInstructionName(Opcode);
+}
+
+
+void X86ATTInstPrinter::printSSECC(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ switch (MI->getOperand(Op).getImm()) {
+ default: assert(0 && "Invalid ssecc argument!");
+ case 0: O << "eq"; break;
+ case 1: O << "lt"; break;
+ case 2: O << "le"; break;
+ case 3: O << "unord"; break;
+ case 4: O << "neq"; break;
+ case 5: O << "nlt"; break;
+ case 6: O << "nle"; break;
+ case 7: O << "ord"; break;
+ }
+}
+
+/// print_pcrel_imm - This is used to print an immediate value that ends up
+/// being encoded as a pc-relative value (e.g. for jumps and calls). These
+/// print slightly differently than normal immediates. For example, a $ is not
+/// emitted.
+void X86ATTInstPrinter::print_pcrel_imm(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ if (Op.isImm())
+ // Print this as a signed 32-bit value.
+ O << (int)Op.getImm();
+ else {
+ assert(Op.isExpr() && "unknown pcrel immediate operand");
+ O << *Op.getExpr();
+ }
+}
+
+void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ if (Op.isReg()) {
+ O << '%' << getRegisterName(Op.getReg());
+ } else if (Op.isImm()) {
+ O << '$' << Op.getImm();
+
+ if (CommentStream && (Op.getImm() > 255 || Op.getImm() < -256))
+ *CommentStream << format("imm = 0x%llX\n", (long long)Op.getImm());
+
+ } else {
+ assert(Op.isExpr() && "unknown operand kind in printOperand");
+ O << '$' << *Op.getExpr();
+ }
+}
+
+void X86ATTInstPrinter::printLeaMemReference(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ const MCOperand &BaseReg = MI->getOperand(Op);
+ const MCOperand &IndexReg = MI->getOperand(Op+2);
+ const MCOperand &DispSpec = MI->getOperand(Op+3);
+
+ if (DispSpec.isImm()) {
+ int64_t DispVal = DispSpec.getImm();
+ if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg()))
+ O << DispVal;
+ } else {
+ assert(DispSpec.isExpr() && "non-immediate displacement for LEA?");
+ O << *DispSpec.getExpr();
+ }
+
+ if (IndexReg.getReg() || BaseReg.getReg()) {
+ O << '(';
+ if (BaseReg.getReg())
+ printOperand(MI, Op, O);
+
+ if (IndexReg.getReg()) {
+ O << ',';
+ printOperand(MI, Op+2, O);
+ unsigned ScaleVal = MI->getOperand(Op+1).getImm();
+ if (ScaleVal != 1)
+ O << ',' << ScaleVal;
+ }
+ O << ')';
+ }
+}
+
+void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ // If this has a segment register, print it.
+ if (MI->getOperand(Op+4).getReg()) {
+ printOperand(MI, Op+4, O);
+ O << ':';
+ }
+ printLeaMemReference(MI, Op, O);
+}
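For reference, a simplified standalone mirror of the printLeaMemReference() decisions above (assumed helper name, not the LLVM classes): the displacement is printed when it is nonzero or when there is neither base nor index register, the parenthesized part appears only when a base or index register is present, and the scale is emitted only when it differs from 1.

#include <cstdio>
#include <string>

// Hypothetical formatter that follows the same branching as
// X86ATTInstPrinter::printLeaMemReference() above.
static std::string formatATTMem(long long Disp, const std::string &Base,
                                const std::string &Index, unsigned Scale) {
  std::string Out;
  bool HasRegs = !Base.empty() || !Index.empty();
  if (Disp != 0 || !HasRegs)                 // bare displacement, or no registers
    Out += std::to_string(Disp);
  if (HasRegs) {
    Out += '(';
    if (!Base.empty())
      Out += '%' + Base;
    if (!Index.empty()) {
      Out += ",%" + Index;
      if (Scale != 1)
        Out += ',' + std::to_string(Scale);
    }
    Out += ')';
  }
  return Out;
}

int main() {
  std::printf("%s\n", formatATTMem(-16, "rbp", "rcx", 4).c_str()); // -16(%rbp,%rcx,4)
  std::printf("%s\n", formatATTMem(0, "rsp", "", 1).c_str());      // (%rsp)
  std::printf("%s\n", formatATTMem(8, "", "", 1).c_str());         // 8
  return 0;
}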
diff --git a/contrib/llvm/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.h b/contrib/llvm/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.h
new file mode 100644
index 0000000..8d5d508
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.h
@@ -0,0 +1,85 @@
+//===-- X86ATTInstPrinter.h - Convert X86 MCInst to assembly syntax -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an X86 MCInst to AT&T style .s file syntax.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86_ATT_INST_PRINTER_H
+#define X86_ATT_INST_PRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+ class MCOperand;
+
+class X86ATTInstPrinter : public MCInstPrinter {
+public:
+ X86ATTInstPrinter(const MCAsmInfo &MAI) : MCInstPrinter(MAI) {}
+
+
+ virtual void printInst(const MCInst *MI, raw_ostream &OS);
+ virtual StringRef getOpcodeName(unsigned Opcode) const;
+
+ // Autogenerated by tblgen.
+ void printInstruction(const MCInst *MI, raw_ostream &OS);
+ static const char *getRegisterName(unsigned RegNo);
+ static const char *getInstructionName(unsigned Opcode);
+
+ void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
+ void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &OS);
+ void printLeaMemReference(const MCInst *MI, unsigned Op, raw_ostream &OS);
+ void printSSECC(const MCInst *MI, unsigned Op, raw_ostream &OS);
+ void print_pcrel_imm(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
+
+ void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemReference(MI, OpNo, O);
+ }
+
+ void printi8mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemReference(MI, OpNo, O);
+ }
+ void printi16mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemReference(MI, OpNo, O);
+ }
+ void printi32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemReference(MI, OpNo, O);
+ }
+ void printi64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemReference(MI, OpNo, O);
+ }
+ void printi128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemReference(MI, OpNo, O);
+ }
+ void printf32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemReference(MI, OpNo, O);
+ }
+ void printf64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemReference(MI, OpNo, O);
+ }
+ void printf80mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemReference(MI, OpNo, O);
+ }
+ void printf128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemReference(MI, OpNo, O);
+ }
+ void printlea32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printLeaMemReference(MI, OpNo, O);
+ }
+ void printlea64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printLeaMemReference(MI, OpNo, O);
+ }
+ void printlea64_32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printLeaMemReference(MI, OpNo, O);
+ }
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/AsmPrinter/X86AsmPrinter.cpp b/contrib/llvm/lib/Target/X86/AsmPrinter/X86AsmPrinter.cpp
new file mode 100644
index 0000000..183213d
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/AsmPrinter/X86AsmPrinter.cpp
@@ -0,0 +1,670 @@
+//===-- X86AsmPrinter.cpp - Convert X86 LLVM code to AT&T assembly --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to X86 machine code.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86AsmPrinter.h"
+#include "X86ATTInstPrinter.h"
+#include "X86IntelInstPrinter.h"
+#include "X86MCInstLower.h"
+#include "X86.h"
+#include "X86COFF.h"
+#include "X86COFFMachineModuleInfo.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86TargetMachine.h"
+#include "llvm/CallingConv.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/Type.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/Mangler.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegistry.h"
+#include "llvm/ADT/SmallString.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Primitive Helper Functions.
+//===----------------------------------------------------------------------===//
+
+void X86AsmPrinter::PrintPICBaseSymbol(raw_ostream &O) const {
+ const TargetLowering *TLI = TM.getTargetLowering();
+ O << *static_cast<const X86TargetLowering*>(TLI)->getPICBaseSymbol(MF,
+ OutContext);
+}
+
+/// runOnMachineFunction - Emit the function body.
+///
+bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+ SetupMachineFunction(MF);
+
+ if (Subtarget->isTargetCOFF()) {
+ bool Intrn = MF.getFunction()->hasInternalLinkage();
+ OutStreamer.BeginCOFFSymbolDef(CurrentFnSym);
+ OutStreamer.EmitCOFFSymbolStorageClass(Intrn ? COFF::C_STAT : COFF::C_EXT);
+ OutStreamer.EmitCOFFSymbolType(COFF::DT_FCN << COFF::N_BTSHFT);
+ OutStreamer.EndCOFFSymbolDef();
+ }
+
+ // Have common code print out the function header with linkage info etc.
+ EmitFunctionHeader();
+
+ // Emit the rest of the function body.
+ EmitFunctionBody();
+
+ // We didn't modify anything.
+ return false;
+}
+
+/// printSymbolOperand - Print a raw symbol reference operand. This handles
+/// jump tables, constant pools, global address and external symbols, all of
+/// which print to a label with various suffixes for relocation types etc.
+void X86AsmPrinter::printSymbolOperand(const MachineOperand &MO,
+ raw_ostream &O) {
+ switch (MO.getType()) {
+ default: llvm_unreachable("unknown symbol type!");
+ case MachineOperand::MO_JumpTableIndex:
+ O << *GetJTISymbol(MO.getIndex());
+ break;
+ case MachineOperand::MO_ConstantPoolIndex:
+ O << *GetCPISymbol(MO.getIndex());
+ printOffset(MO.getOffset(), O);
+ break;
+ case MachineOperand::MO_GlobalAddress: {
+ const GlobalValue *GV = MO.getGlobal();
+
+ MCSymbol *GVSym;
+ if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB)
+ GVSym = GetSymbolWithGlobalValueBase(GV, "$stub");
+ else if (MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY ||
+ MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE ||
+ MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE)
+ GVSym = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+ else
+ GVSym = Mang->getSymbol(GV);
+
+ // Handle dllimport linkage.
+ if (MO.getTargetFlags() == X86II::MO_DLLIMPORT)
+ GVSym = OutContext.GetOrCreateSymbol(Twine("__imp_") + GVSym->getName());
+
+ if (MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY ||
+ MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE) {
+ MCSymbol *Sym = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+ MachineModuleInfoImpl::StubValueTy &StubSym =
+ MMI->getObjFileInfo<MachineModuleInfoMachO>().getGVStubEntry(Sym);
+ if (StubSym.getPointer() == 0)
+ StubSym = MachineModuleInfoImpl::
+ StubValueTy(Mang->getSymbol(GV), !GV->hasInternalLinkage());
+ } else if (MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE){
+ MCSymbol *Sym = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+ MachineModuleInfoImpl::StubValueTy &StubSym =
+ MMI->getObjFileInfo<MachineModuleInfoMachO>().getHiddenGVStubEntry(Sym);
+ if (StubSym.getPointer() == 0)
+ StubSym = MachineModuleInfoImpl::
+ StubValueTy(Mang->getSymbol(GV), !GV->hasInternalLinkage());
+ } else if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB) {
+ MCSymbol *Sym = GetSymbolWithGlobalValueBase(GV, "$stub");
+ MachineModuleInfoImpl::StubValueTy &StubSym =
+ MMI->getObjFileInfo<MachineModuleInfoMachO>().getFnStubEntry(Sym);
+ if (StubSym.getPointer() == 0)
+ StubSym = MachineModuleInfoImpl::
+ StubValueTy(Mang->getSymbol(GV), !GV->hasInternalLinkage());
+ }
+
+ // If the name begins with a dollar-sign, enclose it in parens. We do this
+ // to avoid having it look like an integer immediate to the assembler.
+ if (GVSym->getName()[0] != '$')
+ O << *GVSym;
+ else
+ O << '(' << *GVSym << ')';
+ printOffset(MO.getOffset(), O);
+ break;
+ }
+ case MachineOperand::MO_ExternalSymbol: {
+ const MCSymbol *SymToPrint;
+ if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB) {
+ SmallString<128> TempNameStr;
+ TempNameStr += StringRef(MO.getSymbolName());
+ TempNameStr += StringRef("$stub");
+
+ MCSymbol *Sym = GetExternalSymbolSymbol(TempNameStr.str());
+ MachineModuleInfoImpl::StubValueTy &StubSym =
+ MMI->getObjFileInfo<MachineModuleInfoMachO>().getFnStubEntry(Sym);
+ if (StubSym.getPointer() == 0) {
+ TempNameStr.erase(TempNameStr.end()-5, TempNameStr.end());
+ StubSym = MachineModuleInfoImpl::
+ StubValueTy(OutContext.GetOrCreateSymbol(TempNameStr.str()),
+ true);
+ }
+ SymToPrint = StubSym.getPointer();
+ } else {
+ SymToPrint = GetExternalSymbolSymbol(MO.getSymbolName());
+ }
+
+ // If the name begins with a dollar-sign, enclose it in parens. We do this
+ // to avoid having it look like an integer immediate to the assembler.
+ if (SymToPrint->getName()[0] != '$')
+ O << *SymToPrint;
+ else
+      O << '(' << *SymToPrint << ')';
+ break;
+ }
+ }
+
+ switch (MO.getTargetFlags()) {
+ default:
+ llvm_unreachable("Unknown target flag on GV operand");
+ case X86II::MO_NO_FLAG: // No flag.
+ break;
+ case X86II::MO_DARWIN_NONLAZY:
+ case X86II::MO_DLLIMPORT:
+ case X86II::MO_DARWIN_STUB:
+ // These affect the name of the symbol, not any suffix.
+ break;
+ case X86II::MO_GOT_ABSOLUTE_ADDRESS:
+ O << " + [.-";
+ PrintPICBaseSymbol(O);
+ O << ']';
+ break;
+ case X86II::MO_PIC_BASE_OFFSET:
+ case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
+ case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE:
+ O << '-';
+ PrintPICBaseSymbol(O);
+ break;
+ case X86II::MO_TLSGD: O << "@TLSGD"; break;
+ case X86II::MO_GOTTPOFF: O << "@GOTTPOFF"; break;
+ case X86II::MO_INDNTPOFF: O << "@INDNTPOFF"; break;
+ case X86II::MO_TPOFF: O << "@TPOFF"; break;
+ case X86II::MO_NTPOFF: O << "@NTPOFF"; break;
+ case X86II::MO_GOTPCREL: O << "@GOTPCREL"; break;
+ case X86II::MO_GOT: O << "@GOT"; break;
+ case X86II::MO_GOTOFF: O << "@GOTOFF"; break;
+ case X86II::MO_PLT: O << "@PLT"; break;
+ }
+}
+
+/// print_pcrel_imm - This is used to print an immediate value that ends up
+/// being encoded as a pc-relative value. These print slightly differently, for
+/// example, a $ is not emitted.
+void X86AsmPrinter::print_pcrel_imm(const MachineInstr *MI, unsigned OpNo,
+ raw_ostream &O) {
+ const MachineOperand &MO = MI->getOperand(OpNo);
+ switch (MO.getType()) {
+ default: llvm_unreachable("Unknown pcrel immediate operand");
+ case MachineOperand::MO_Immediate:
+ O << MO.getImm();
+ return;
+ case MachineOperand::MO_MachineBasicBlock:
+ O << *MO.getMBB()->getSymbol();
+ return;
+ case MachineOperand::MO_GlobalAddress:
+ case MachineOperand::MO_ExternalSymbol:
+ printSymbolOperand(MO, O);
+ return;
+ }
+}
+
+
+void X86AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
+ raw_ostream &O, const char *Modifier) {
+ const MachineOperand &MO = MI->getOperand(OpNo);
+ switch (MO.getType()) {
+ default: llvm_unreachable("unknown operand type!");
+ case MachineOperand::MO_Register: {
+ O << '%';
+ unsigned Reg = MO.getReg();
+ if (Modifier && strncmp(Modifier, "subreg", strlen("subreg")) == 0) {
+ EVT VT = (strcmp(Modifier+6,"64") == 0) ?
+ MVT::i64 : ((strcmp(Modifier+6, "32") == 0) ? MVT::i32 :
+ ((strcmp(Modifier+6,"16") == 0) ? MVT::i16 : MVT::i8));
+ Reg = getX86SubSuperRegister(Reg, VT);
+ }
+ O << X86ATTInstPrinter::getRegisterName(Reg);
+ return;
+ }
+
+ case MachineOperand::MO_Immediate:
+ O << '$' << MO.getImm();
+ return;
+
+ case MachineOperand::MO_JumpTableIndex:
+ case MachineOperand::MO_ConstantPoolIndex:
+ case MachineOperand::MO_GlobalAddress:
+ case MachineOperand::MO_ExternalSymbol: {
+ O << '$';
+ printSymbolOperand(MO, O);
+ break;
+ }
+ }
+}
+
+void X86AsmPrinter::printSSECC(const MachineInstr *MI, unsigned Op,
+ raw_ostream &O) {
+ unsigned char value = MI->getOperand(Op).getImm();
+ assert(value <= 7 && "Invalid ssecc argument!");
+ switch (value) {
+ case 0: O << "eq"; break;
+ case 1: O << "lt"; break;
+ case 2: O << "le"; break;
+ case 3: O << "unord"; break;
+ case 4: O << "neq"; break;
+ case 5: O << "nlt"; break;
+ case 6: O << "nle"; break;
+ case 7: O << "ord"; break;
+ }
+}
+
+void X86AsmPrinter::printLeaMemReference(const MachineInstr *MI, unsigned Op,
+ raw_ostream &O, const char *Modifier) {
+ const MachineOperand &BaseReg = MI->getOperand(Op);
+ const MachineOperand &IndexReg = MI->getOperand(Op+2);
+ const MachineOperand &DispSpec = MI->getOperand(Op+3);
+
+ // If we really don't want to print out (rip), don't.
+ bool HasBaseReg = BaseReg.getReg() != 0;
+ if (HasBaseReg && Modifier && !strcmp(Modifier, "no-rip") &&
+ BaseReg.getReg() == X86::RIP)
+ HasBaseReg = false;
+
+ // HasParenPart - True if we will print out the () part of the mem ref.
+ bool HasParenPart = IndexReg.getReg() || HasBaseReg;
+
+ if (DispSpec.isImm()) {
+ int DispVal = DispSpec.getImm();
+ if (DispVal || !HasParenPart)
+ O << DispVal;
+ } else {
+ assert(DispSpec.isGlobal() || DispSpec.isCPI() ||
+ DispSpec.isJTI() || DispSpec.isSymbol());
+ printSymbolOperand(MI->getOperand(Op+3), O);
+ }
+
+ if (HasParenPart) {
+ assert(IndexReg.getReg() != X86::ESP &&
+ "X86 doesn't allow scaling by ESP");
+
+ O << '(';
+ if (HasBaseReg)
+ printOperand(MI, Op, O, Modifier);
+
+ if (IndexReg.getReg()) {
+ O << ',';
+ printOperand(MI, Op+2, O, Modifier);
+ unsigned ScaleVal = MI->getOperand(Op+1).getImm();
+ if (ScaleVal != 1)
+ O << ',' << ScaleVal;
+ }
+ O << ')';
+ }
+}
+
+void X86AsmPrinter::printMemReference(const MachineInstr *MI, unsigned Op,
+ raw_ostream &O, const char *Modifier) {
+ assert(isMem(MI, Op) && "Invalid memory reference!");
+ const MachineOperand &Segment = MI->getOperand(Op+4);
+ if (Segment.getReg()) {
+ printOperand(MI, Op+4, O, Modifier);
+ O << ':';
+ }
+ printLeaMemReference(MI, Op, O, Modifier);
+}
+
+void X86AsmPrinter::printPICLabel(const MachineInstr *MI, unsigned Op,
+ raw_ostream &O) {
+ PrintPICBaseSymbol(O);
+ O << '\n';
+ PrintPICBaseSymbol(O);
+ O << ':';
+}
+
+bool X86AsmPrinter::printAsmMRegister(const MachineOperand &MO, char Mode,
+ raw_ostream &O) {
+ unsigned Reg = MO.getReg();
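+ // For example, if MO is %eax: 'b' prints %al, 'h' prints %ah, 'w' prints %ax,
+ // 'k' prints %eax, and 'q' prints %rax.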
+ switch (Mode) {
+ default: return true; // Unknown mode.
+ case 'b': // Print QImode register
+ Reg = getX86SubSuperRegister(Reg, MVT::i8);
+ break;
+ case 'h': // Print QImode high register
+ Reg = getX86SubSuperRegister(Reg, MVT::i8, true);
+ break;
+ case 'w': // Print HImode register
+ Reg = getX86SubSuperRegister(Reg, MVT::i16);
+ break;
+ case 'k': // Print SImode register
+ Reg = getX86SubSuperRegister(Reg, MVT::i32);
+ break;
+ case 'q': // Print DImode register
+ Reg = getX86SubSuperRegister(Reg, MVT::i64);
+ break;
+ }
+
+ O << '%' << X86ATTInstPrinter::getRegisterName(Reg);
+ return false;
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode, raw_ostream &O) {
+ // Does this asm operand have a single letter operand modifier?
+ if (ExtraCode && ExtraCode[0]) {
+ if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+ const MachineOperand &MO = MI->getOperand(OpNo);
+
+ switch (ExtraCode[0]) {
+ default: return true; // Unknown modifier.
+ case 'a': // This is an address. Currently only 'i' and 'r' are expected.
+ if (MO.isImm()) {
+ O << MO.getImm();
+ return false;
+ }
+ if (MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isSymbol()) {
+ printSymbolOperand(MO, O);
+ return false;
+ }
+ if (MO.isReg()) {
+ O << '(';
+ printOperand(MI, OpNo, O);
+ O << ')';
+ return false;
+ }
+ return true;
+
+ case 'c': // Don't print "$" before a global var name or constant.
+ if (MO.isImm())
+ O << MO.getImm();
+ else if (MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isSymbol())
+ printSymbolOperand(MO, O);
+ else
+ printOperand(MI, OpNo, O);
+ return false;
+
+ case 'A': // Print '*' before a register (it must be a register)
+ if (MO.isReg()) {
+ O << '*';
+ printOperand(MI, OpNo, O);
+ return false;
+ }
+ return true;
+
+ case 'b': // Print QImode register
+ case 'h': // Print QImode high register
+ case 'w': // Print HImode register
+ case 'k': // Print SImode register
+ case 'q': // Print DImode register
+ if (MO.isReg())
+ return printAsmMRegister(MO, ExtraCode[0], O);
+ printOperand(MI, OpNo, O);
+ return false;
+
+ case 'P': // This is the operand of a call, treat specially.
+ print_pcrel_imm(MI, OpNo, O);
+ return false;
+
+ case 'n': // Negate the immediate or print a '-' before the operand.
+ // Note: this is a temporary solution. It should be handled
+ // target-independently as part of the 'MC' work.
+ if (MO.isImm()) {
+ O << -MO.getImm();
+ return false;
+ }
+ O << '-';
+ }
+ }
+
+ printOperand(MI, OpNo, O);
+ return false;
+}
+
+bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+ unsigned OpNo, unsigned AsmVariant,
+ const char *ExtraCode,
+ raw_ostream &O) {
+ if (ExtraCode && ExtraCode[0]) {
+ if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+ switch (ExtraCode[0]) {
+ default: return true; // Unknown modifier.
+ case 'b': // Print QImode register
+ case 'h': // Print QImode high register
+ case 'w': // Print HImode register
+ case 'k': // Print SImode register
+ case 'q': // Print DImode register
+ // These only apply to registers, ignore on mem.
+ break;
+ case 'P': // Don't print @PLT, but do print as memory.
+ printMemReference(MI, OpNo, O, "no-rip");
+ return false;
+ }
+ }
+ printMemReference(MI, OpNo, O);
+ return false;
+}
+
+void X86AsmPrinter::EmitStartOfAsmFile(Module &M) {
+ if (Subtarget->isTargetDarwin())
+ OutStreamer.SwitchSection(getObjFileLowering().getTextSection());
+}
+
+
+void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
+ if (Subtarget->isTargetDarwin()) {
+ // All darwin targets use mach-o.
+ MachineModuleInfoMachO &MMIMacho =
+ MMI->getObjFileInfo<MachineModuleInfoMachO>();
+
+ // Output stubs for dynamically-linked functions.
+ MachineModuleInfoMachO::SymbolListTy Stubs;
+
+ Stubs = MMIMacho.GetFnStubList();
+ if (!Stubs.empty()) {
+ const MCSection *TheSection =
+ OutContext.getMachOSection("__IMPORT", "__jump_table",
+ MCSectionMachO::S_SYMBOL_STUBS |
+ MCSectionMachO::S_ATTR_SELF_MODIFYING_CODE |
+ MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS,
+ 5, SectionKind::getMetadata());
+ OutStreamer.SwitchSection(TheSection);
+
+ for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
+ // L_foo$stub:
+ OutStreamer.EmitLabel(Stubs[i].first);
+ // .indirect_symbol _foo
+ OutStreamer.EmitSymbolAttribute(Stubs[i].second.getPointer(),
+ MCSA_IndirectSymbol);
+ // hlt; hlt; hlt; hlt; hlt (hlt = 0xf4 = -12).
+ const char HltInsts[] = { -12, -12, -12, -12, -12 };
+ OutStreamer.EmitBytes(StringRef(HltInsts, 5), 0/*addrspace*/);
+ }
+
+ Stubs.clear();
+ OutStreamer.AddBlankLine();
+ }
+
+ // Output stubs for external and common global variables.
+ Stubs = MMIMacho.GetGVStubList();
+ if (!Stubs.empty()) {
+ const MCSection *TheSection =
+ OutContext.getMachOSection("__IMPORT", "__pointers",
+ MCSectionMachO::S_NON_LAZY_SYMBOL_POINTERS,
+ SectionKind::getMetadata());
+ OutStreamer.SwitchSection(TheSection);
+
+ for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
+ // L_foo$non_lazy_ptr:
+ OutStreamer.EmitLabel(Stubs[i].first);
+ // .indirect_symbol _foo
+ MachineModuleInfoImpl::StubValueTy &MCSym = Stubs[i].second;
+ OutStreamer.EmitSymbolAttribute(MCSym.getPointer(),
+ MCSA_IndirectSymbol);
+ // .long 0
+ if (MCSym.getInt())
+ // External to current translation unit.
+ OutStreamer.EmitIntValue(0, 4/*size*/, 0/*addrspace*/);
+ else
+ // Internal to current translation unit.
+ //
+ // When we place the LSDA into the TEXT section, the type info
+ // pointers need to be indirect and pc-rel. We accomplish this by
+ // using NLPs. However, sometimes the types are local to the file. So
+ // we need to fill in the value for the NLP in those cases.
+ OutStreamer.EmitValue(MCSymbolRefExpr::Create(MCSym.getPointer(),
+ OutContext),
+ 4/*size*/, 0/*addrspace*/);
+ }
+ Stubs.clear();
+ OutStreamer.AddBlankLine();
+ }
+
+ Stubs = MMIMacho.GetHiddenGVStubList();
+ if (!Stubs.empty()) {
+ OutStreamer.SwitchSection(getObjFileLowering().getDataSection());
+ EmitAlignment(2);
+
+ for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
+ // L_foo$non_lazy_ptr:
+ OutStreamer.EmitLabel(Stubs[i].first);
+ // .long _foo
+ OutStreamer.EmitValue(MCSymbolRefExpr::
+ Create(Stubs[i].second.getPointer(),
+ OutContext),
+ 4/*size*/, 0/*addrspace*/);
+ }
+ Stubs.clear();
+ OutStreamer.AddBlankLine();
+ }
+
+ // Funny Darwin hack: This flag tells the linker that no global symbols
+ // contain code that falls through to other global symbols (e.g. the obvious
+ // implementation of multiple entry points). If this doesn't occur, the
+ // linker can safely perform dead code stripping. Since LLVM never
+ // generates code that does this, it is always safe to set.
+ OutStreamer.EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
+ }
+
+ if (Subtarget->isTargetCOFF()) {
+ X86COFFMachineModuleInfo &COFFMMI =
+ MMI->getObjFileInfo<X86COFFMachineModuleInfo>();
+
+ // Emit type information for external functions
+ typedef X86COFFMachineModuleInfo::externals_iterator externals_iterator;
+ for (externals_iterator I = COFFMMI.externals_begin(),
+ E = COFFMMI.externals_end();
+ I != E; ++I) {
+ OutStreamer.BeginCOFFSymbolDef(CurrentFnSym);
+ OutStreamer.EmitCOFFSymbolStorageClass(COFF::C_EXT);
+ OutStreamer.EmitCOFFSymbolType(COFF::DT_FCN << COFF::N_BTSHFT);
+ OutStreamer.EndCOFFSymbolDef();
+ }
+
+ // Necessary for dllexport support
+ std::vector<const MCSymbol*> DLLExportedFns, DLLExportedGlobals;
+
+ const TargetLoweringObjectFileCOFF &TLOFCOFF =
+ static_cast<const TargetLoweringObjectFileCOFF&>(getObjFileLowering());
+
+ for (Module::const_iterator I = M.begin(), E = M.end(); I != E; ++I)
+ if (I->hasDLLExportLinkage())
+ DLLExportedFns.push_back(Mang->getSymbol(I));
+
+ for (Module::const_global_iterator I = M.global_begin(),
+ E = M.global_end(); I != E; ++I)
+ if (I->hasDLLExportLinkage())
+ DLLExportedGlobals.push_back(Mang->getSymbol(I));
+
+ // Output linker support code for dllexported globals on windows.
+ if (!DLLExportedGlobals.empty() || !DLLExportedFns.empty()) {
+ OutStreamer.SwitchSection(TLOFCOFF.getDrectveSection());
+ SmallString<128> name;
+ for (unsigned i = 0, e = DLLExportedGlobals.size(); i != e; ++i) {
+ if (Subtarget->isTargetWindows())
+ name = " /EXPORT:";
+ else
+ name = " -export:";
+ name += DLLExportedGlobals[i]->getName();
+ if (Subtarget->isTargetWindows())
+ name += ",DATA";
+ else
+ name += ",data";
+ OutStreamer.EmitBytes(name, 0);
+ }
+
+ for (unsigned i = 0, e = DLLExportedFns.size(); i != e; ++i) {
+ if (Subtarget->isTargetWindows())
+ name = " /EXPORT:";
+ else
+ name = " -export:";
+ name += DLLExportedFns[i]->getName();
+ OutStreamer.EmitBytes(name, 0);
+ }
+ }
+ }
+
+ if (Subtarget->isTargetELF()) {
+ const TargetLoweringObjectFileELF &TLOFELF =
+ static_cast<const TargetLoweringObjectFileELF &>(getObjFileLowering());
+
+ MachineModuleInfoELF &MMIELF = MMI->getObjFileInfo<MachineModuleInfoELF>();
+
+ // Output stubs for external and common global variables.
+ MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList();
+ if (!Stubs.empty()) {
+ OutStreamer.SwitchSection(TLOFELF.getDataRelSection());
+ const TargetData *TD = TM.getTargetData();
+
+ for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
+ OutStreamer.EmitLabel(Stubs[i].first);
+ OutStreamer.EmitSymbolValue(Stubs[i].second.getPointer(),
+ TD->getPointerSize(), 0);
+ }
+ Stubs.clear();
+ }
+ }
+}
+
+
+//===----------------------------------------------------------------------===//
+// Target Registry Stuff
+//===----------------------------------------------------------------------===//
+
+static MCInstPrinter *createX86MCInstPrinter(const Target &T,
+ unsigned SyntaxVariant,
+ const MCAsmInfo &MAI) {
+ if (SyntaxVariant == 0)
+ return new X86ATTInstPrinter(MAI);
+ if (SyntaxVariant == 1)
+ return new X86IntelInstPrinter(MAI);
+ return 0;
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeX86AsmPrinter() {
+ RegisterAsmPrinter<X86AsmPrinter> X(TheX86_32Target);
+ RegisterAsmPrinter<X86AsmPrinter> Y(TheX86_64Target);
+
+ TargetRegistry::RegisterMCInstPrinter(TheX86_32Target,createX86MCInstPrinter);
+ TargetRegistry::RegisterMCInstPrinter(TheX86_64Target,createX86MCInstPrinter);
+}
diff --git a/contrib/llvm/lib/Target/X86/AsmPrinter/X86AsmPrinter.h b/contrib/llvm/lib/Target/X86/AsmPrinter/X86AsmPrinter.h
new file mode 100644
index 0000000..b5a7f8d
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/AsmPrinter/X86AsmPrinter.h
@@ -0,0 +1,89 @@
+//===-- X86AsmPrinter.h - Convert X86 LLVM code to assembly -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// AT&T assembly code printer class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86ASMPRINTER_H
+#define X86ASMPRINTER_H
+
+#include "../X86.h"
+#include "../X86MachineFunctionInfo.h"
+#include "../X86TargetMachine.h"
+#include "llvm/ADT/StringSet.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+
+class MachineJumpTableInfo;
+class MCContext;
+class MCInst;
+class MCStreamer;
+class MCSymbol;
+
+class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
+ const X86Subtarget *Subtarget;
+ public:
+ explicit X86AsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
+ : AsmPrinter(TM, Streamer) {
+ Subtarget = &TM.getSubtarget<X86Subtarget>();
+ }
+
+ virtual const char *getPassName() const {
+ return "X86 AT&T-Style Assembly Printer";
+ }
+
+ const X86Subtarget &getSubtarget() const { return *Subtarget; }
+
+ virtual void EmitStartOfAsmFile(Module &M);
+
+ virtual void EmitEndOfAsmFile(Module &M);
+
+ virtual void EmitInstruction(const MachineInstr *MI);
+
+ void printSymbolOperand(const MachineOperand &MO, raw_ostream &O);
+
+ // These methods are used by the tablegen'erated instruction printer.
+ void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O,
+ const char *Modifier = 0);
+ void print_pcrel_imm(const MachineInstr *MI, unsigned OpNo, raw_ostream &O);
+
+ bool printAsmMRegister(const MachineOperand &MO, char Mode, raw_ostream &O);
+ bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &OS);
+ bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &OS);
+
+ void printMachineInstruction(const MachineInstr *MI);
+ void printSSECC(const MachineInstr *MI, unsigned Op, raw_ostream &O);
+ void printMemReference(const MachineInstr *MI, unsigned Op, raw_ostream &O,
+ const char *Modifier=NULL);
+ void printLeaMemReference(const MachineInstr *MI, unsigned Op, raw_ostream &O,
+ const char *Modifier=NULL);
+
+ void printPICLabel(const MachineInstr *MI, unsigned Op, raw_ostream &O);
+
+ void PrintPICBaseSymbol(raw_ostream &O) const;
+
+ bool runOnMachineFunction(MachineFunction &F);
+
+ void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS);
+
+ MachineLocation getDebugValueLocation(const MachineInstr *MI) const;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/AsmPrinter/X86IntelInstPrinter.cpp b/contrib/llvm/lib/Target/X86/AsmPrinter/X86IntelInstPrinter.cpp
new file mode 100644
index 0000000..7e0a9bb
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/AsmPrinter/X86IntelInstPrinter.cpp
@@ -0,0 +1,138 @@
+//===-- X86IntelInstPrinter.cpp - Intel assembly instruction printing -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file includes code for rendering MCInst instances as Intel-style
+// assembly.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "X86IntelInstPrinter.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "X86GenInstrNames.inc"
+using namespace llvm;
+
+// Include the auto-generated portion of the assembly writer.
+#define MachineInstr MCInst
+#define GET_INSTRUCTION_NAME
+#include "X86GenAsmWriter1.inc"
+#undef MachineInstr
+
+void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS) {
+ printInstruction(MI, OS);
+}
+StringRef X86IntelInstPrinter::getOpcodeName(unsigned Opcode) const {
+ return getInstructionName(Opcode);
+}
+
+void X86IntelInstPrinter::printSSECC(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ switch (MI->getOperand(Op).getImm()) {
+ default: assert(0 && "Invalid ssecc argument!");
+ case 0: O << "eq"; break;
+ case 1: O << "lt"; break;
+ case 2: O << "le"; break;
+ case 3: O << "unord"; break;
+ case 4: O << "neq"; break;
+ case 5: O << "nlt"; break;
+ case 6: O << "nle"; break;
+ case 7: O << "ord"; break;
+ }
+}
+
+/// print_pcrel_imm - This is used to print an immediate value that ends up
+/// being encoded as a pc-relative value.
+void X86IntelInstPrinter::print_pcrel_imm(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ if (Op.isImm())
+ O << Op.getImm();
+ else {
+ assert(Op.isExpr() && "unknown pcrel immediate operand");
+ O << *Op.getExpr();
+ }
+}
+
+static void PrintRegName(raw_ostream &O, StringRef RegName) {
+ for (unsigned i = 0, e = RegName.size(); i != e; ++i)
+ O << (char)toupper(RegName[i]);
+}
+
+void X86IntelInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ if (Op.isReg()) {
+ PrintRegName(O, getRegisterName(Op.getReg()));
+ } else if (Op.isImm()) {
+ O << Op.getImm();
+ } else {
+ assert(Op.isExpr() && "unknown operand kind in printOperand");
+ O << *Op.getExpr();
+ }
+}
+
+void X86IntelInstPrinter::printLeaMemReference(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ const MCOperand &BaseReg = MI->getOperand(Op);
+ unsigned ScaleVal = MI->getOperand(Op+1).getImm();
+ const MCOperand &IndexReg = MI->getOperand(Op+2);
+ const MCOperand &DispSpec = MI->getOperand(Op+3);
+
+ O << '[';
+
+ bool NeedPlus = false;
+ if (BaseReg.getReg()) {
+ printOperand(MI, Op, O);
+ NeedPlus = true;
+ }
+
+ if (IndexReg.getReg()) {
+ if (NeedPlus) O << " + ";
+ if (ScaleVal != 1)
+ O << ScaleVal << '*';
+ printOperand(MI, Op+2, O);
+ NeedPlus = true;
+ }
+
+
+ if (!DispSpec.isImm()) {
+ if (NeedPlus) O << " + ";
+ assert(DispSpec.isExpr() && "non-immediate displacement for LEA?");
+ O << *DispSpec.getExpr();
+ } else {
+ int64_t DispVal = DispSpec.getImm();
+ if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) {
+ if (NeedPlus) {
+ if (DispVal > 0)
+ O << " + ";
+ else {
+ O << " - ";
+ DispVal = -DispVal;
+ }
+ }
+ O << DispVal;
+ }
+ }
+
+ O << ']';
+}
+
+void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ // If this has a segment register, print it.
+ if (MI->getOperand(Op+4).getReg()) {
+ printOperand(MI, Op+4, O);
+ O << ':';
+ }
+ printLeaMemReference(MI, Op, O);
+}
diff --git a/contrib/llvm/lib/Target/X86/AsmPrinter/X86IntelInstPrinter.h b/contrib/llvm/lib/Target/X86/AsmPrinter/X86IntelInstPrinter.h
new file mode 100644
index 0000000..a0beeb2
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/AsmPrinter/X86IntelInstPrinter.h
@@ -0,0 +1,100 @@
+//===-- X86IntelInstPrinter.h - Convert X86 MCInst to assembly syntax -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an X86 MCInst to intel style .s file syntax.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86_INTEL_INST_PRINTER_H
+#define X86_INTEL_INST_PRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+ class MCOperand;
+
+class X86IntelInstPrinter : public MCInstPrinter {
+public:
+ X86IntelInstPrinter(const MCAsmInfo &MAI)
+ : MCInstPrinter(MAI) {}
+
+ virtual void printInst(const MCInst *MI, raw_ostream &OS);
+ virtual StringRef getOpcodeName(unsigned Opcode) const;
+
+ // Autogenerated by tblgen.
+ void printInstruction(const MCInst *MI, raw_ostream &O);
+ static const char *getRegisterName(unsigned RegNo);
+ static const char *getInstructionName(unsigned Opcode);
+
+
+ void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &O);
+ void printLeaMemReference(const MCInst *MI, unsigned Op, raw_ostream &O);
+ void printSSECC(const MCInst *MI, unsigned Op, raw_ostream &O);
+ void print_pcrel_imm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+
+ void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "OPAQUE PTR ";
+ printMemReference(MI, OpNo, O);
+ }
+
+ void printi8mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "BYTE PTR ";
+ printMemReference(MI, OpNo, O);
+ }
+ void printi16mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "WORD PTR ";
+ printMemReference(MI, OpNo, O);
+ }
+ void printi32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "DWORD PTR ";
+ printMemReference(MI, OpNo, O);
+ }
+ void printi64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "QWORD PTR ";
+ printMemReference(MI, OpNo, O);
+ }
+ void printi128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "XMMWORD PTR ";
+ printMemReference(MI, OpNo, O);
+ }
+ void printf32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "DWORD PTR ";
+ printMemReference(MI, OpNo, O);
+ }
+ void printf64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "QWORD PTR ";
+ printMemReference(MI, OpNo, O);
+ }
+ void printf80mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "XWORD PTR ";
+ printMemReference(MI, OpNo, O);
+ }
+ void printf128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "XMMWORD PTR ";
+ printMemReference(MI, OpNo, O);
+ }
+ void printlea32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "DWORD PTR ";
+ printLeaMemReference(MI, OpNo, O);
+ }
+ void printlea64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "QWORD PTR ";
+ printLeaMemReference(MI, OpNo, O);
+ }
+ void printlea64_32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "QWORD PTR ";
+ printLeaMemReference(MI, OpNo, O);
+ }
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/AsmPrinter/X86MCInstLower.cpp b/contrib/llvm/lib/Target/X86/AsmPrinter/X86MCInstLower.cpp
new file mode 100644
index 0000000..4edeca9
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/AsmPrinter/X86MCInstLower.cpp
@@ -0,0 +1,584 @@
+//===-- X86MCInstLower.cpp - Convert X86 MachineInstr to an MCInst --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower X86 MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86MCInstLower.h"
+#include "X86AsmPrinter.h"
+#include "X86COFFMachineModuleInfo.h"
+#include "X86MCAsmInfo.h"
+#include "llvm/Analysis/DebugInfo.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Target/Mangler.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/Type.h"
+using namespace llvm;
+
+
+const X86Subtarget &X86MCInstLower::getSubtarget() const {
+ return AsmPrinter.getSubtarget();
+}
+
+MachineModuleInfoMachO &X86MCInstLower::getMachOMMI() const {
+ assert(getSubtarget().isTargetDarwin() &&"Can only get MachO info on darwin");
+ return AsmPrinter.MMI->getObjFileInfo<MachineModuleInfoMachO>();
+}
+
+
+MCSymbol *X86MCInstLower::GetPICBaseSymbol() const {
+ const TargetLowering *TLI = AsmPrinter.TM.getTargetLowering();
+ return static_cast<const X86TargetLowering*>(TLI)->
+ getPICBaseSymbol(AsmPrinter.MF, Ctx);
+}
+
+/// GetSymbolFromOperand - Lower an MO_GlobalAddress or MO_ExternalSymbol
+/// operand to an MCSymbol.
+MCSymbol *X86MCInstLower::
+GetSymbolFromOperand(const MachineOperand &MO) const {
+ assert((MO.isGlobal() || MO.isSymbol()) && "Isn't a symbol reference");
+
+ SmallString<128> Name;
+
+ if (!MO.isGlobal()) {
+ assert(MO.isSymbol());
+ Name += AsmPrinter.MAI->getGlobalPrefix();
+ Name += MO.getSymbolName();
+ } else {
+ const GlobalValue *GV = MO.getGlobal();
+ bool isImplicitlyPrivate = false;
+ if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB ||
+ MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY ||
+ MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE ||
+ MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE)
+ isImplicitlyPrivate = true;
+
+ Mang->getNameWithPrefix(Name, GV, isImplicitlyPrivate);
+ }
+
+ // If the target flags on the operand change the name of the symbol, do that
+ // before we return the symbol.
+ switch (MO.getTargetFlags()) {
+ default: break;
+ case X86II::MO_DLLIMPORT: {
+ // Handle dllimport linkage.
+ const char *Prefix = "__imp_";
+ Name.insert(Name.begin(), Prefix, Prefix+strlen(Prefix));
+ break;
+ }
+ case X86II::MO_DARWIN_NONLAZY:
+ case X86II::MO_DARWIN_NONLAZY_PIC_BASE: {
+ Name += "$non_lazy_ptr";
+ MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name.str());
+
+ MachineModuleInfoImpl::StubValueTy &StubSym =
+ getMachOMMI().getGVStubEntry(Sym);
+ if (StubSym.getPointer() == 0) {
+ assert(MO.isGlobal() && "Extern symbol not handled yet");
+ StubSym =
+ MachineModuleInfoImpl::
+ StubValueTy(AsmPrinter.Mang->getSymbol(MO.getGlobal()),
+ !MO.getGlobal()->hasInternalLinkage());
+ }
+ return Sym;
+ }
+ case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: {
+ Name += "$non_lazy_ptr";
+ MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name.str());
+ MachineModuleInfoImpl::StubValueTy &StubSym =
+ getMachOMMI().getHiddenGVStubEntry(Sym);
+ if (StubSym.getPointer() == 0) {
+ assert(MO.isGlobal() && "Extern symbol not handled yet");
+ StubSym =
+ MachineModuleInfoImpl::
+ StubValueTy(AsmPrinter.Mang->getSymbol(MO.getGlobal()),
+ !MO.getGlobal()->hasInternalLinkage());
+ }
+ return Sym;
+ }
+ case X86II::MO_DARWIN_STUB: {
+ Name += "$stub";
+ MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name.str());
+ MachineModuleInfoImpl::StubValueTy &StubSym =
+ getMachOMMI().getFnStubEntry(Sym);
+ if (StubSym.getPointer())
+ return Sym;
+
+ if (MO.isGlobal()) {
+ StubSym =
+ MachineModuleInfoImpl::
+ StubValueTy(AsmPrinter.Mang->getSymbol(MO.getGlobal()),
+ !MO.getGlobal()->hasInternalLinkage());
+ } else {
+ Name.erase(Name.end()-5, Name.end());
+ StubSym =
+ MachineModuleInfoImpl::
+ StubValueTy(Ctx.GetOrCreateSymbol(Name.str()), false);
+ }
+ return Sym;
+ }
+ }
+
+ return Ctx.GetOrCreateSymbol(Name.str());
+}
+
+MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
+ MCSymbol *Sym) const {
+ // FIXME: We would like an efficient form for this, so we don't have to do a
+ // lot of extra uniquing.
+ const MCExpr *Expr = 0;
+ MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None;
+
+ switch (MO.getTargetFlags()) {
+ default: llvm_unreachable("Unknown target flag on GV operand");
+ case X86II::MO_NO_FLAG: // No flag.
+ // These affect the name of the symbol, not any suffix.
+ case X86II::MO_DARWIN_NONLAZY:
+ case X86II::MO_DLLIMPORT:
+ case X86II::MO_DARWIN_STUB:
+ break;
+
+ case X86II::MO_TLSGD: RefKind = MCSymbolRefExpr::VK_TLSGD; break;
+ case X86II::MO_GOTTPOFF: RefKind = MCSymbolRefExpr::VK_GOTTPOFF; break;
+ case X86II::MO_INDNTPOFF: RefKind = MCSymbolRefExpr::VK_INDNTPOFF; break;
+ case X86II::MO_TPOFF: RefKind = MCSymbolRefExpr::VK_TPOFF; break;
+ case X86II::MO_NTPOFF: RefKind = MCSymbolRefExpr::VK_NTPOFF; break;
+ case X86II::MO_GOTPCREL: RefKind = MCSymbolRefExpr::VK_GOTPCREL; break;
+ case X86II::MO_GOT: RefKind = MCSymbolRefExpr::VK_GOT; break;
+ case X86II::MO_GOTOFF: RefKind = MCSymbolRefExpr::VK_GOTOFF; break;
+ case X86II::MO_PLT: RefKind = MCSymbolRefExpr::VK_PLT; break;
+ case X86II::MO_PIC_BASE_OFFSET:
+ case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
+ case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE:
+ Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+ // Subtract the pic base.
+ Expr = MCBinaryExpr::CreateSub(Expr,
+ MCSymbolRefExpr::Create(GetPICBaseSymbol(), Ctx),
+ Ctx);
+ if (MO.isJTI() && AsmPrinter.MAI->hasSetDirective()) {
+ // If .set directive is supported, use it to reduce the number of
+ // relocations the assembler will generate for differences between
+ // local labels. This is only safe when the symbols are in the same
+ // section so we are restricting it to jumptable references.
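+ // For example, rather than "LJTI0_0-L0$pb" this emits ".set Ltmp, LJTI0_0-L0$pb"
+ // (label names here are illustrative) and then references Ltmp instead.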
+ MCSymbol *Label = Ctx.CreateTempSymbol();
+ AsmPrinter.OutStreamer.EmitAssignment(Label, Expr);
+ Expr = MCSymbolRefExpr::Create(Label, Ctx);
+ }
+ break;
+ }
+
+ if (Expr == 0)
+ Expr = MCSymbolRefExpr::Create(Sym, RefKind, Ctx);
+
+ if (!MO.isJTI() && MO.getOffset())
+ Expr = MCBinaryExpr::CreateAdd(Expr,
+ MCConstantExpr::Create(MO.getOffset(), Ctx),
+ Ctx);
+ return MCOperand::CreateExpr(Expr);
+}
+
+
+
+static void lower_subreg32(MCInst *MI, unsigned OpNo) {
+ // Convert registers in the addr mode according to subreg32.
+ unsigned Reg = MI->getOperand(OpNo).getReg();
+ if (Reg != 0)
+ MI->getOperand(OpNo).setReg(getX86SubSuperRegister(Reg, MVT::i32));
+}
+
+static void lower_lea64_32mem(MCInst *MI, unsigned OpNo) {
+ // Convert registers in the addr mode according to subreg64.
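+ // For example, a 32-bit base register such as %edx is rewritten to %rdx so
+ // the printed/encoded form uses the 64-bit addressing mode.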
+ for (unsigned i = 0; i != 4; ++i) {
+ if (!MI->getOperand(OpNo+i).isReg()) continue;
+
+ unsigned Reg = MI->getOperand(OpNo+i).getReg();
+ if (Reg == 0) continue;
+
+ MI->getOperand(OpNo+i).setReg(getX86SubSuperRegister(Reg, MVT::i64));
+ }
+}
+
+/// LowerSubReg32_Op0 - Things like MOVZX16rr8 -> MOVZX32rr8.
+static void LowerSubReg32_Op0(MCInst &OutMI, unsigned NewOpc) {
+ OutMI.setOpcode(NewOpc);
+ lower_subreg32(&OutMI, 0);
+}
+/// LowerUnaryToTwoAddr - R = setb -> R = sbb R, R
+static void LowerUnaryToTwoAddr(MCInst &OutMI, unsigned NewOpc) {
+ OutMI.setOpcode(NewOpc);
+ OutMI.addOperand(OutMI.getOperand(0));
+ OutMI.addOperand(OutMI.getOperand(0));
+}
+
+/// \brief Simplify FOO $imm, %{al,ax,eax,rax} to FOO $imm, for instruction with
+/// a short fixed-register form.
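+/// For example, ADD32ri with an %eax destination becomes ADD32i32, which
+/// encodes the %eax operand implicitly.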
+static void SimplifyShortImmForm(MCInst &Inst, unsigned Opcode) {
+ unsigned ImmOp = Inst.getNumOperands() - 1;
+ assert(Inst.getOperand(0).isReg() && Inst.getOperand(ImmOp).isImm() &&
+ ((Inst.getNumOperands() == 3 && Inst.getOperand(1).isReg() &&
+ Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg()) ||
+ Inst.getNumOperands() == 2) && "Unexpected instruction!");
+
+ // Check whether the destination register can be fixed.
+ unsigned Reg = Inst.getOperand(0).getReg();
+ if (Reg != X86::AL && Reg != X86::AX && Reg != X86::EAX && Reg != X86::RAX)
+ return;
+
+ // If so, rewrite the instruction.
+ MCOperand Saved = Inst.getOperand(ImmOp);
+ Inst = MCInst();
+ Inst.setOpcode(Opcode);
+ Inst.addOperand(Saved);
+}
+
+/// \brief Simplify things like MOV32rm to MOV32o32a.
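+/// For example, a MOV32rm that loads %eax from a bare absolute address is
+/// rewritten to the one-operand MOV32o32a form.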
+static void SimplifyShortMoveForm(MCInst &Inst, unsigned Opcode) {
+ bool IsStore = Inst.getOperand(0).isReg() && Inst.getOperand(1).isReg();
+ unsigned AddrBase = IsStore;
+ unsigned RegOp = IsStore ? 0 : 5;
+ unsigned AddrOp = AddrBase + 3;
+ assert(Inst.getNumOperands() == 6 && Inst.getOperand(RegOp).isReg() &&
+ Inst.getOperand(AddrBase + 0).isReg() && // base
+ Inst.getOperand(AddrBase + 1).isImm() && // scale
+ Inst.getOperand(AddrBase + 2).isReg() && // index register
+ (Inst.getOperand(AddrOp).isExpr() || // address
+ Inst.getOperand(AddrOp).isImm())&&
+ Inst.getOperand(AddrBase + 4).isReg() && // segment
+ "Unexpected instruction!");
+
+ // Check whether the destination register can be fixed.
+ unsigned Reg = Inst.getOperand(RegOp).getReg();
+ if (Reg != X86::AL && Reg != X86::AX && Reg != X86::EAX && Reg != X86::RAX)
+ return;
+
+ // Check whether this is an absolute address.
+ if (Inst.getOperand(AddrBase + 0).getReg() != 0 ||
+ Inst.getOperand(AddrBase + 2).getReg() != 0 ||
+ Inst.getOperand(AddrBase + 4).getReg() != 0 ||
+ Inst.getOperand(AddrBase + 1).getImm() != 1)
+ return;
+
+ // If so, rewrite the instruction.
+ MCOperand Saved = Inst.getOperand(AddrOp);
+ Inst = MCInst();
+ Inst.setOpcode(Opcode);
+ Inst.addOperand(Saved);
+}
+
+void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+ OutMI.setOpcode(MI->getOpcode());
+
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+
+ MCOperand MCOp;
+ switch (MO.getType()) {
+ default:
+ MI->dump();
+ llvm_unreachable("unknown operand type");
+ case MachineOperand::MO_Register:
+ // Ignore all implicit register operands.
+ if (MO.isImplicit()) continue;
+ MCOp = MCOperand::CreateReg(MO.getReg());
+ break;
+ case MachineOperand::MO_Immediate:
+ MCOp = MCOperand::CreateImm(MO.getImm());
+ break;
+ case MachineOperand::MO_MachineBasicBlock:
+ MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create(
+ MO.getMBB()->getSymbol(), Ctx));
+ break;
+ case MachineOperand::MO_GlobalAddress:
+ MCOp = LowerSymbolOperand(MO, GetSymbolFromOperand(MO));
+ break;
+ case MachineOperand::MO_ExternalSymbol:
+ MCOp = LowerSymbolOperand(MO, GetSymbolFromOperand(MO));
+ break;
+ case MachineOperand::MO_JumpTableIndex:
+ MCOp = LowerSymbolOperand(MO, AsmPrinter.GetJTISymbol(MO.getIndex()));
+ break;
+ case MachineOperand::MO_ConstantPoolIndex:
+ MCOp = LowerSymbolOperand(MO, AsmPrinter.GetCPISymbol(MO.getIndex()));
+ break;
+ case MachineOperand::MO_BlockAddress:
+ MCOp = LowerSymbolOperand(MO,
+ AsmPrinter.GetBlockAddressSymbol(MO.getBlockAddress()));
+ break;
+ }
+
+ OutMI.addOperand(MCOp);
+ }
+
+ // Handle a few special cases to eliminate operand modifiers.
+ switch (OutMI.getOpcode()) {
+ case X86::LEA64_32r: // Handle 'subreg rewriting' for the lea64_32mem operand.
+ lower_lea64_32mem(&OutMI, 1);
+ break;
+ case X86::MOVZX16rr8: LowerSubReg32_Op0(OutMI, X86::MOVZX32rr8); break;
+ case X86::MOVZX16rm8: LowerSubReg32_Op0(OutMI, X86::MOVZX32rm8); break;
+ case X86::MOVSX16rr8: LowerSubReg32_Op0(OutMI, X86::MOVSX32rr8); break;
+ case X86::MOVSX16rm8: LowerSubReg32_Op0(OutMI, X86::MOVSX32rm8); break;
+ case X86::MOVZX64rr32: LowerSubReg32_Op0(OutMI, X86::MOV32rr); break;
+ case X86::MOVZX64rm32: LowerSubReg32_Op0(OutMI, X86::MOV32rm); break;
+ case X86::MOV64ri64i32: LowerSubReg32_Op0(OutMI, X86::MOV32ri); break;
+ case X86::MOVZX64rr8: LowerSubReg32_Op0(OutMI, X86::MOVZX32rr8); break;
+ case X86::MOVZX64rm8: LowerSubReg32_Op0(OutMI, X86::MOVZX32rm8); break;
+ case X86::MOVZX64rr16: LowerSubReg32_Op0(OutMI, X86::MOVZX32rr16); break;
+ case X86::MOVZX64rm16: LowerSubReg32_Op0(OutMI, X86::MOVZX32rm16); break;
+ case X86::SETB_C8r: LowerUnaryToTwoAddr(OutMI, X86::SBB8rr); break;
+ case X86::SETB_C16r: LowerUnaryToTwoAddr(OutMI, X86::SBB16rr); break;
+ case X86::SETB_C32r: LowerUnaryToTwoAddr(OutMI, X86::SBB32rr); break;
+ case X86::SETB_C64r: LowerUnaryToTwoAddr(OutMI, X86::SBB64rr); break;
+ case X86::MOV8r0: LowerUnaryToTwoAddr(OutMI, X86::XOR8rr); break;
+ case X86::MOV32r0: LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); break;
+ case X86::MMX_V_SET0: LowerUnaryToTwoAddr(OutMI, X86::MMX_PXORrr); break;
+ case X86::MMX_V_SETALLONES:
+ LowerUnaryToTwoAddr(OutMI, X86::MMX_PCMPEQDrr); break;
+ case X86::FsFLD0SS: LowerUnaryToTwoAddr(OutMI, X86::PXORrr); break;
+ case X86::FsFLD0SD: LowerUnaryToTwoAddr(OutMI, X86::PXORrr); break;
+ case X86::V_SET0PS: LowerUnaryToTwoAddr(OutMI, X86::XORPSrr); break;
+ case X86::V_SET0PD: LowerUnaryToTwoAddr(OutMI, X86::XORPDrr); break;
+ case X86::V_SET0PI: LowerUnaryToTwoAddr(OutMI, X86::PXORrr); break;
+ case X86::V_SETALLONES: LowerUnaryToTwoAddr(OutMI, X86::PCMPEQDrr); break;
+
+ case X86::MOV16r0:
+ LowerSubReg32_Op0(OutMI, X86::MOV32r0); // MOV16r0 -> MOV32r0
+ LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); // MOV32r0 -> XOR32rr
+ break;
+ case X86::MOV64r0:
+ LowerSubReg32_Op0(OutMI, X86::MOV32r0); // MOV64r0 -> MOV32r0
+ LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); // MOV32r0 -> XOR32rr
+ break;
+
+ // TAILJMPr, TAILJMPr64, CALL64r, CALL64pcrel32 - These instructions have
+ // register inputs modeled as normal uses instead of implicit uses. As such,
+ // truncate off all but the first operand (the callee). FIXME: Change isel.
+ case X86::TAILJMPr:
+ case X86::TAILJMPr64:
+ case X86::CALL64r:
+ case X86::CALL64pcrel32: {
+ unsigned Opcode = OutMI.getOpcode();
+ MCOperand Saved = OutMI.getOperand(0);
+ OutMI = MCInst();
+ OutMI.setOpcode(Opcode);
+ OutMI.addOperand(Saved);
+ break;
+ }
+
+ // TAILJMPd, TAILJMPd64 - Lower to the correct jump instructions.
+ case X86::TAILJMPd:
+ case X86::TAILJMPd64: {
+ MCOperand Saved = OutMI.getOperand(0);
+ OutMI = MCInst();
+ OutMI.setOpcode(X86::TAILJMP_1);
+ OutMI.addOperand(Saved);
+ break;
+ }
+
+ // The assembler backend wants to see branches in their small form and relax
+ // them to their large form. The JIT can only handle the large form because
+ // it does not do relaxation. For now, translate the large form to the
+ // small one here.
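+ // For example, JMP_4 is emitted as JMP_1, and the assembler widens it back
+ // to the 4-byte form only if the target turns out to be out of range.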
+ case X86::JMP_4: OutMI.setOpcode(X86::JMP_1); break;
+ case X86::JO_4: OutMI.setOpcode(X86::JO_1); break;
+ case X86::JNO_4: OutMI.setOpcode(X86::JNO_1); break;
+ case X86::JB_4: OutMI.setOpcode(X86::JB_1); break;
+ case X86::JAE_4: OutMI.setOpcode(X86::JAE_1); break;
+ case X86::JE_4: OutMI.setOpcode(X86::JE_1); break;
+ case X86::JNE_4: OutMI.setOpcode(X86::JNE_1); break;
+ case X86::JBE_4: OutMI.setOpcode(X86::JBE_1); break;
+ case X86::JA_4: OutMI.setOpcode(X86::JA_1); break;
+ case X86::JS_4: OutMI.setOpcode(X86::JS_1); break;
+ case X86::JNS_4: OutMI.setOpcode(X86::JNS_1); break;
+ case X86::JP_4: OutMI.setOpcode(X86::JP_1); break;
+ case X86::JNP_4: OutMI.setOpcode(X86::JNP_1); break;
+ case X86::JL_4: OutMI.setOpcode(X86::JL_1); break;
+ case X86::JGE_4: OutMI.setOpcode(X86::JGE_1); break;
+ case X86::JLE_4: OutMI.setOpcode(X86::JLE_1); break;
+ case X86::JG_4: OutMI.setOpcode(X86::JG_1); break;
+
+ // We don't currently select the correct instruction form for instructions
+ // which have a short %eax, etc. form. Handle this by custom lowering, for
+ // now.
+ //
+ // Note, we are currently not handling the following instructions:
+ // MOV64ao8, MOV64o8a
+ // XCHG16ar, XCHG32ar, XCHG64ar
+ case X86::MOV8mr_NOREX:
+ case X86::MOV8mr: SimplifyShortMoveForm(OutMI, X86::MOV8ao8); break;
+ case X86::MOV8rm_NOREX:
+ case X86::MOV8rm: SimplifyShortMoveForm(OutMI, X86::MOV8o8a); break;
+ case X86::MOV16mr: SimplifyShortMoveForm(OutMI, X86::MOV16ao16); break;
+ case X86::MOV16rm: SimplifyShortMoveForm(OutMI, X86::MOV16o16a); break;
+ case X86::MOV32mr: SimplifyShortMoveForm(OutMI, X86::MOV32ao32); break;
+ case X86::MOV32rm: SimplifyShortMoveForm(OutMI, X86::MOV32o32a); break;
+ case X86::MOV64mr: SimplifyShortMoveForm(OutMI, X86::MOV64ao64); break;
+ case X86::MOV64rm: SimplifyShortMoveForm(OutMI, X86::MOV64o64a); break;
+
+ case X86::ADC8ri: SimplifyShortImmForm(OutMI, X86::ADC8i8); break;
+ case X86::ADC16ri: SimplifyShortImmForm(OutMI, X86::ADC16i16); break;
+ case X86::ADC32ri: SimplifyShortImmForm(OutMI, X86::ADC32i32); break;
+ case X86::ADC64ri32: SimplifyShortImmForm(OutMI, X86::ADC64i32); break;
+ case X86::ADD8ri: SimplifyShortImmForm(OutMI, X86::ADD8i8); break;
+ case X86::ADD16ri: SimplifyShortImmForm(OutMI, X86::ADD16i16); break;
+ case X86::ADD32ri: SimplifyShortImmForm(OutMI, X86::ADD32i32); break;
+ case X86::ADD64ri32: SimplifyShortImmForm(OutMI, X86::ADD64i32); break;
+ case X86::AND8ri: SimplifyShortImmForm(OutMI, X86::AND8i8); break;
+ case X86::AND16ri: SimplifyShortImmForm(OutMI, X86::AND16i16); break;
+ case X86::AND32ri: SimplifyShortImmForm(OutMI, X86::AND32i32); break;
+ case X86::AND64ri32: SimplifyShortImmForm(OutMI, X86::AND64i32); break;
+ case X86::CMP8ri: SimplifyShortImmForm(OutMI, X86::CMP8i8); break;
+ case X86::CMP16ri: SimplifyShortImmForm(OutMI, X86::CMP16i16); break;
+ case X86::CMP32ri: SimplifyShortImmForm(OutMI, X86::CMP32i32); break;
+ case X86::CMP64ri32: SimplifyShortImmForm(OutMI, X86::CMP64i32); break;
+ case X86::OR8ri: SimplifyShortImmForm(OutMI, X86::OR8i8); break;
+ case X86::OR16ri: SimplifyShortImmForm(OutMI, X86::OR16i16); break;
+ case X86::OR32ri: SimplifyShortImmForm(OutMI, X86::OR32i32); break;
+ case X86::OR64ri32: SimplifyShortImmForm(OutMI, X86::OR64i32); break;
+ case X86::SBB8ri: SimplifyShortImmForm(OutMI, X86::SBB8i8); break;
+ case X86::SBB16ri: SimplifyShortImmForm(OutMI, X86::SBB16i16); break;
+ case X86::SBB32ri: SimplifyShortImmForm(OutMI, X86::SBB32i32); break;
+ case X86::SBB64ri32: SimplifyShortImmForm(OutMI, X86::SBB64i32); break;
+ case X86::SUB8ri: SimplifyShortImmForm(OutMI, X86::SUB8i8); break;
+ case X86::SUB16ri: SimplifyShortImmForm(OutMI, X86::SUB16i16); break;
+ case X86::SUB32ri: SimplifyShortImmForm(OutMI, X86::SUB32i32); break;
+ case X86::SUB64ri32: SimplifyShortImmForm(OutMI, X86::SUB64i32); break;
+ case X86::TEST8ri: SimplifyShortImmForm(OutMI, X86::TEST8i8); break;
+ case X86::TEST16ri: SimplifyShortImmForm(OutMI, X86::TEST16i16); break;
+ case X86::TEST32ri: SimplifyShortImmForm(OutMI, X86::TEST32i32); break;
+ case X86::TEST64ri32: SimplifyShortImmForm(OutMI, X86::TEST64i32); break;
+ case X86::XOR8ri: SimplifyShortImmForm(OutMI, X86::XOR8i8); break;
+ case X86::XOR16ri: SimplifyShortImmForm(OutMI, X86::XOR16i16); break;
+ case X86::XOR32ri: SimplifyShortImmForm(OutMI, X86::XOR32i32); break;
+ case X86::XOR64ri32: SimplifyShortImmForm(OutMI, X86::XOR64i32); break;
+ }
+}
+
+void X86AsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
+ raw_ostream &O) {
+ // Only the target-dependent form of DBG_VALUE should get here.
+ // Referencing the offset and metadata as NOps-2 and NOps-1 is
+ // probably portable to other targets; frame pointer location is not.
+ unsigned NOps = MI->getNumOperands();
+ assert(NOps==7);
+ O << '\t' << MAI->getCommentString() << "DEBUG_VALUE: ";
+ // Cast away const; DIVariable etc. do not take const operands for some reason.
+ DIVariable V(const_cast<MDNode *>(MI->getOperand(NOps-1).getMetadata()));
+ if (V.getContext().isSubprogram())
+ O << DISubprogram(V.getContext()).getDisplayName() << ":";
+ O << V.getName();
+ O << " <- ";
+ // Frame address. Currently handles register +- offset only.
+ assert(MI->getOperand(0).isReg() && MI->getOperand(3).isImm());
+ O << '['; printOperand(MI, 0, O); O << '+'; printOperand(MI, 3, O);
+ O << ']';
+ O << "+";
+ printOperand(MI, NOps-2, O);
+}
+
+MachineLocation
+X86AsmPrinter::getDebugValueLocation(const MachineInstr *MI) const {
+ MachineLocation Location;
+ assert (MI->getNumOperands() == 7 && "Invalid no. of machine operands!");
+ // Frame address. Currently handles register +- offset only.
+ assert(MI->getOperand(0).isReg() && MI->getOperand(3).isImm());
+ Location.set(MI->getOperand(0).getReg(), MI->getOperand(3).getImm());
+ return Location;
+}
+
+
+void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
+ X86MCInstLower MCInstLowering(OutContext, Mang, *this);
+ switch (MI->getOpcode()) {
+ case TargetOpcode::DBG_VALUE:
+ if (isVerbose() && OutStreamer.hasRawTextSupport()) {
+ std::string TmpStr;
+ raw_string_ostream OS(TmpStr);
+ PrintDebugValueComment(MI, OS);
+ OutStreamer.EmitRawText(StringRef(OS.str()));
+ }
+ return;
+
+ case X86::MOVPC32r: {
+ MCInst TmpInst;
+ // This is a pseudo op for a two instruction sequence with a label, which
+ // looks like:
+ // call "L1$pb"
+ // "L1$pb":
+ // popl %esi
+
+ // Emit the call.
+ MCSymbol *PICBase = MCInstLowering.GetPICBaseSymbol();
+ TmpInst.setOpcode(X86::CALLpcrel32);
+ // FIXME: We would like an efficient form for this, so we don't have to do a
+ // lot of extra uniquing.
+ TmpInst.addOperand(MCOperand::CreateExpr(MCSymbolRefExpr::Create(PICBase,
+ OutContext)));
+ OutStreamer.EmitInstruction(TmpInst);
+
+ // Emit the label.
+ OutStreamer.EmitLabel(PICBase);
+
+ // popl $reg
+ TmpInst.setOpcode(X86::POP32r);
+ TmpInst.getOperand(0) = MCOperand::CreateReg(MI->getOperand(0).getReg());
+ OutStreamer.EmitInstruction(TmpInst);
+ return;
+ }
+
+ case X86::ADD32ri: {
+ // Lower the MO_GOT_ABSOLUTE_ADDRESS form of ADD32ri.
+ if (MI->getOperand(2).getTargetFlags() != X86II::MO_GOT_ABSOLUTE_ADDRESS)
+ break;
+
+ // Okay, we have something like:
+ // EAX = ADD32ri EAX, MO_GOT_ABSOLUTE_ADDRESS(@MYGLOBAL)
+
+ // For this, we want to print something like:
+ // MYGLOBAL + (. - PICBASE)
+ // However, we can't generate a ".", so just emit a new label here and refer
+ // to it.
+ MCSymbol *DotSym = OutContext.CreateTempSymbol();
+ OutStreamer.EmitLabel(DotSym);
+
+ // Now that we have emitted the label, lower the complex operand expression.
+ MCSymbol *OpSym = MCInstLowering.GetSymbolFromOperand(MI->getOperand(2));
+
+ const MCExpr *DotExpr = MCSymbolRefExpr::Create(DotSym, OutContext);
+ const MCExpr *PICBase =
+ MCSymbolRefExpr::Create(MCInstLowering.GetPICBaseSymbol(), OutContext);
+ DotExpr = MCBinaryExpr::CreateSub(DotExpr, PICBase, OutContext);
+
+ DotExpr = MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(OpSym,OutContext),
+ DotExpr, OutContext);
+
+ MCInst TmpInst;
+ TmpInst.setOpcode(X86::ADD32ri);
+ TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg()));
+ TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(1).getReg()));
+ TmpInst.addOperand(MCOperand::CreateExpr(DotExpr));
+ OutStreamer.EmitInstruction(TmpInst);
+ return;
+ }
+ }
+
+ MCInst TmpInst;
+ MCInstLowering.Lower(MI, TmpInst);
+
+ OutStreamer.EmitInstruction(TmpInst);
+}
+
diff --git a/contrib/llvm/lib/Target/X86/AsmPrinter/X86MCInstLower.h b/contrib/llvm/lib/Target/X86/AsmPrinter/X86MCInstLower.h
new file mode 100644
index 0000000..9e5474f
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/AsmPrinter/X86MCInstLower.h
@@ -0,0 +1,51 @@
+//===-- X86MCInstLower.h - Lower MachineInstr to MCInst -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86_MCINSTLOWER_H
+#define X86_MCINSTLOWER_H
+
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+ class MCContext;
+ class MCInst;
+ class MCOperand;
+ class MCSymbol;
+ class MachineInstr;
+ class MachineModuleInfoMachO;
+ class MachineOperand;
+ class Mangler;
+ class X86AsmPrinter;
+ class X86Subtarget;
+
+/// X86MCInstLower - This class is used to lower a MachineInstr into an MCInst.
+class LLVM_LIBRARY_VISIBILITY X86MCInstLower {
+ MCContext &Ctx;
+ Mangler *Mang;
+ X86AsmPrinter &AsmPrinter;
+
+ const X86Subtarget &getSubtarget() const;
+public:
+ X86MCInstLower(MCContext &ctx, Mangler *mang, X86AsmPrinter &asmprinter)
+ : Ctx(ctx), Mang(mang), AsmPrinter(asmprinter) {}
+
+ void Lower(const MachineInstr *MI, MCInst &OutMI) const;
+
+ MCSymbol *GetPICBaseSymbol() const;
+
+ MCSymbol *GetSymbolFromOperand(const MachineOperand &MO) const;
+ MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
+
+private:
+ MachineModuleInfoMachO &getMachOMMI() const;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/CMakeLists.txt b/contrib/llvm/lib/Target/X86/CMakeLists.txt
new file mode 100644
index 0000000..1334820
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/CMakeLists.txt
@@ -0,0 +1,52 @@
+set(LLVM_TARGET_DEFINITIONS X86.td)
+
+tablegen(X86GenRegisterInfo.h.inc -gen-register-desc-header)
+tablegen(X86GenRegisterNames.inc -gen-register-enums)
+tablegen(X86GenRegisterInfo.inc -gen-register-desc)
+tablegen(X86GenDisassemblerTables.inc -gen-disassembler)
+tablegen(X86GenInstrNames.inc -gen-instr-enums)
+tablegen(X86GenInstrInfo.inc -gen-instr-desc)
+tablegen(X86GenAsmWriter.inc -gen-asm-writer)
+tablegen(X86GenAsmWriter1.inc -gen-asm-writer -asmwriternum=1)
+tablegen(X86GenAsmMatcher.inc -gen-asm-matcher)
+tablegen(X86GenDAGISel.inc -gen-dag-isel)
+tablegen(X86GenFastISel.inc -gen-fast-isel)
+tablegen(X86GenCallingConv.inc -gen-callingconv)
+tablegen(X86GenSubtarget.inc -gen-subtarget)
+tablegen(X86GenEDInfo.inc -gen-enhanced-disassembly-info)
+
+set(sources
+ SSEDomainFix.cpp
+ X86AsmBackend.cpp
+ X86CodeEmitter.cpp
+ X86COFFMachineModuleInfo.cpp
+ X86ELFWriterInfo.cpp
+ X86FloatingPoint.cpp
+ X86FloatingPointRegKill.cpp
+ X86ISelDAGToDAG.cpp
+ X86ISelLowering.cpp
+ X86InstrInfo.cpp
+ X86JITInfo.cpp
+ X86MCAsmInfo.cpp
+ X86MCCodeEmitter.cpp
+ X86RegisterInfo.cpp
+ X86Subtarget.cpp
+ X86TargetMachine.cpp
+ X86TargetObjectFile.cpp
+ X86FastISel.cpp
+ X86SelectionDAGInfo.cpp
+ )
+
+if( CMAKE_CL_64 )
+ enable_language(ASM_MASM)
+ ADD_CUSTOM_COMMAND(
+ OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/X86CompilationCallback_Win64.obj
+ COMMAND ${CMAKE_ASM_MASM_COMPILER} /Fo ${CMAKE_CURRENT_BINARY_DIR}/X86CompilationCallback_Win64.obj /c ${CMAKE_CURRENT_SOURCE_DIR}/X86CompilationCallback_Win64.asm
+ DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/X86CompilationCallback_Win64.asm
+ )
+ set(sources ${sources} ${CMAKE_CURRENT_BINARY_DIR}/X86CompilationCallback_Win64.obj)
+endif()
+
+add_llvm_target(X86CodeGen ${sources})
+
+target_link_libraries (LLVMX86CodeGen LLVMSelectionDAG)
diff --git a/contrib/llvm/lib/Target/X86/Disassembler/CMakeLists.txt b/contrib/llvm/lib/Target/X86/Disassembler/CMakeLists.txt
new file mode 100644
index 0000000..9f91060
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/Disassembler/CMakeLists.txt
@@ -0,0 +1,14 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMX86Disassembler
+ X86Disassembler.cpp
+ X86DisassemblerDecoder.c
+ )
+# workaround for hanging compilation on MSVC9
+if( MSVC_VERSION EQUAL 1500 )
+set_property(
+ SOURCE X86Disassembler.cpp
+ PROPERTY COMPILE_FLAGS "/Od"
+ )
+endif()
+add_dependencies(LLVMX86Disassembler X86CodeGenTable_gen)
diff --git a/contrib/llvm/lib/Target/X86/Disassembler/Makefile b/contrib/llvm/lib/Target/X86/Disassembler/Makefile
new file mode 100644
index 0000000..8669fd8
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/Disassembler/Makefile
@@ -0,0 +1,16 @@
+##===- lib/Target/X86/Disassembler/Makefile ----------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../../..
+LIBRARYNAME = LLVMX86Disassembler
+
+# Hack: we need to include 'main' x86 target directory to grab private headers
+CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
new file mode 100644
index 0000000..8a5a630
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -0,0 +1,566 @@
+//===- X86Disassembler.cpp - Disassembler for x86 and x86_64 ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the X86 Disassembler.
+// It contains code to translate the data produced by the decoder into
+// MCInsts.
+// Documentation for the disassembler can be found in X86Disassembler.h.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86Disassembler.h"
+#include "X86DisassemblerDecoder.h"
+
+#include "llvm/MC/EDInstInfo.h"
+#include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MemoryObject.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include "X86GenRegisterNames.inc"
+#include "X86GenEDInfo.inc"
+
+using namespace llvm;
+using namespace llvm::X86Disassembler;
+
+void x86DisassemblerDebug(const char *file,
+ unsigned line,
+ const char *s) {
+ dbgs() << file << ":" << line << ": " << s;
+}
+
+#define debug(s) DEBUG(x86DisassemblerDebug(__FILE__, __LINE__, s));
+
+namespace llvm {
+
+// Fill-ins to make the compiler happy. These constants are never actually
+// assigned; they are just filler to make an automatically-generated switch
+// statement work.
+namespace X86 {
+ enum {
+ BX_SI = 500,
+ BX_DI = 501,
+ BP_SI = 502,
+ BP_DI = 503,
+ sib = 504,
+ sib64 = 505
+ };
+}
+
+extern Target TheX86_32Target, TheX86_64Target;
+
+}
+
+static bool translateInstruction(MCInst &target,
+ InternalInstruction &source);
+
+X86GenericDisassembler::X86GenericDisassembler(DisassemblerMode mode) :
+ MCDisassembler(),
+ fMode(mode) {
+}
+
+X86GenericDisassembler::~X86GenericDisassembler() {
+}
+
+EDInstInfo *X86GenericDisassembler::getEDInfo() const {
+ return instInfoX86;
+}
+
+/// regionReader - a callback function that wraps the readByte method from
+/// MemoryObject.
+///
+/// @param arg - The generic callback parameter. In this case, this should
+/// be a pointer to a MemoryObject.
+/// @param byte - A pointer to the byte to be read.
+/// @param address - The address to be read.
+static int regionReader(void* arg, uint8_t* byte, uint64_t address) {
+ MemoryObject* region = static_cast<MemoryObject*>(arg);
+ return region->readByte(address, byte);
+}
+
+/// logger - a callback function that wraps the operator<< method from
+/// raw_ostream.
+///
+/// @param arg - The generic callback parameter. This should be a pointer
+/// to a raw_ostream.
+/// @param log - A string to be logged. logger() adds a newline.
+static void logger(void* arg, const char* log) {
+ if (!arg)
+ return;
+
+ raw_ostream &vStream = *(static_cast<raw_ostream*>(arg));
+ vStream << log << "\n";
+}
+
+//
+// Public interface for the disassembler
+//
+
+bool X86GenericDisassembler::getInstruction(MCInst &instr,
+ uint64_t &size,
+ const MemoryObject &region,
+ uint64_t address,
+ raw_ostream &vStream) const {
+ InternalInstruction internalInstr;
+
+ int ret = decodeInstruction(&internalInstr,
+ regionReader,
+ (void*)&region,
+ logger,
+ (void*)&vStream,
+ address,
+ fMode);
+
+ if (ret) {
+ size = internalInstr.readerCursor - address;
+ return false;
+ }
+ else {
+ size = internalInstr.length;
+ return !translateInstruction(instr, internalInstr);
+ }
+}
+
+//
+// Private code that translates from struct InternalInstructions to MCInsts.
+//
+
+/// translateRegister - Translates an internal register to the appropriate LLVM
+/// register, and appends it as an operand to an MCInst.
+///
+/// @param mcInst - The MCInst to append to.
+/// @param reg - The Reg to append.
+static void translateRegister(MCInst &mcInst, Reg reg) {
+#define ENTRY(x) X86::x,
+ uint8_t llvmRegnums[] = {
+ ALL_REGS
+ 0
+ };
+#undef ENTRY
+
+ uint8_t llvmRegnum = llvmRegnums[reg];
+ mcInst.addOperand(MCOperand::CreateReg(llvmRegnum));
+}
+
+/// translateImmediate - Appends an immediate operand to an MCInst.
+///
+/// @param mcInst - The MCInst to append to.
+/// @param immediate - The immediate value to append.
+/// @param operand - The operand, as stored in the descriptor table.
+/// @param insn - The internal instruction.
+static void translateImmediate(MCInst &mcInst,
+ uint64_t immediate,
+ OperandSpecifier &operand,
+ InternalInstruction &insn) {
+ // Sign-extend the immediate if necessary.
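+ // For example, an 8-bit TYPE_REL8 value of 0xf0 becomes the 64-bit value -16.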
+
+ OperandType type = operand.type;
+
+ if (type == TYPE_RELv) {
+ switch (insn.displacementSize) {
+ default:
+ break;
+ case 8:
+ type = TYPE_MOFFS8;
+ break;
+ case 16:
+ type = TYPE_MOFFS16;
+ break;
+ case 32:
+ type = TYPE_MOFFS32;
+ break;
+ case 64:
+ type = TYPE_MOFFS64;
+ break;
+ }
+ }
+
+ switch (type) {
+ case TYPE_MOFFS8:
+ case TYPE_REL8:
+ if(immediate & 0x80)
+ immediate |= ~(0xffull);
+ break;
+ case TYPE_MOFFS16:
+ if(immediate & 0x8000)
+ immediate |= ~(0xffffull);
+ break;
+ case TYPE_MOFFS32:
+ case TYPE_REL32:
+ case TYPE_REL64:
+ if(immediate & 0x80000000)
+ immediate |= ~(0xffffffffull);
+ break;
+ case TYPE_MOFFS64:
+ default:
+ // operand is 64 bits wide. Do nothing.
+ break;
+ }
+
+ mcInst.addOperand(MCOperand::CreateImm(immediate));
+}
+
+/// translateRMRegister - Translates a register stored in the R/M field of the
+/// ModR/M byte to its LLVM equivalent and appends it to an MCInst.
+/// @param mcInst - The MCInst to append to.
+/// @param insn - The internal instruction to extract the R/M field
+/// from.
+/// @return - false on success; true otherwise
+static bool translateRMRegister(MCInst &mcInst,
+ InternalInstruction &insn) {
+ if (insn.eaBase == EA_BASE_sib || insn.eaBase == EA_BASE_sib64) {
+ debug("A R/M register operand may not have a SIB byte");
+ return true;
+ }
+
+ switch (insn.eaBase) {
+ default:
+ debug("Unexpected EA base register");
+ return true;
+ case EA_BASE_NONE:
+ debug("EA_BASE_NONE for ModR/M base");
+ return true;
+#define ENTRY(x) case EA_BASE_##x:
+ ALL_EA_BASES
+#undef ENTRY
+ debug("A R/M register operand may not have a base; "
+ "the operand must be a register.");
+ return true;
+#define ENTRY(x) \
+ case EA_REG_##x: \
+ mcInst.addOperand(MCOperand::CreateReg(X86::x)); break;
+ ALL_REGS
+#undef ENTRY
+ }
+
+ return false;
+}
+
+/// translateRMMemory - Translates a memory operand stored in the Mod and R/M
+/// fields of an internal instruction (and possibly its SIB byte) to a memory
+/// operand in LLVM's format, and appends it to an MCInst.
+///
+/// @param mcInst - The MCInst to append to.
+/// @param insn - The instruction to extract Mod, R/M, and SIB fields
+/// from.
+/// @param sr - Whether or not to emit the segment register. The
+/// LEA instruction does not expect a segment-register
+/// operand.
+/// @return - false on success; true otherwise
+static bool translateRMMemory(MCInst &mcInst,
+ InternalInstruction &insn,
+ bool sr) {
+ // Addresses in an MCInst are represented as five operands:
+ // 1. basereg (register) The R/M base, or (if there is a SIB) the
+ // SIB base
+ // 2. scaleamount (immediate) 1, or (if there is a SIB) the specified
+ // scale amount
+ // 3. indexreg (register) x86_registerNONE, or (if there is a SIB)
+ // the index (which is multiplied by the
+ // scale amount)
+ // 4. displacement (immediate) 0, or the displacement if there is one
+ // 5. segmentreg (register) x86_registerNONE for now, but could be set
+ // if we have segment overrides
+
+ MCOperand baseReg;
+ MCOperand scaleAmount;
+ MCOperand indexReg;
+ MCOperand displacement;
+ MCOperand segmentReg;
+
+ if (insn.eaBase == EA_BASE_sib || insn.eaBase == EA_BASE_sib64) {
+ if (insn.sibBase != SIB_BASE_NONE) {
+ switch (insn.sibBase) {
+ default:
+ debug("Unexpected sibBase");
+ return true;
+#define ENTRY(x) \
+ case SIB_BASE_##x: \
+ baseReg = MCOperand::CreateReg(X86::x); break;
+ ALL_SIB_BASES
+#undef ENTRY
+ }
+ } else {
+ baseReg = MCOperand::CreateReg(0);
+ }
+
+ if (insn.sibIndex != SIB_INDEX_NONE) {
+ switch (insn.sibIndex) {
+ default:
+ debug("Unexpected sibIndex");
+ return true;
+#define ENTRY(x) \
+ case SIB_INDEX_##x: \
+ indexReg = MCOperand::CreateReg(X86::x); break;
+ EA_BASES_32BIT
+ EA_BASES_64BIT
+#undef ENTRY
+ }
+ } else {
+ indexReg = MCOperand::CreateReg(0);
+ }
+
+ scaleAmount = MCOperand::CreateImm(insn.sibScale);
+ } else {
+ switch (insn.eaBase) {
+ case EA_BASE_NONE:
+ if (insn.eaDisplacement == EA_DISP_NONE) {
+ debug("EA_BASE_NONE and EA_DISP_NONE for ModR/M base");
+ return true;
+ }
+ if (insn.mode == MODE_64BIT)
+ baseReg = MCOperand::CreateReg(X86::RIP); // Section 2.2.1.6
+ else
+ baseReg = MCOperand::CreateReg(0);
+
+ indexReg = MCOperand::CreateReg(0);
+ break;
+ case EA_BASE_BX_SI:
+ baseReg = MCOperand::CreateReg(X86::BX);
+ indexReg = MCOperand::CreateReg(X86::SI);
+ break;
+ case EA_BASE_BX_DI:
+ baseReg = MCOperand::CreateReg(X86::BX);
+ indexReg = MCOperand::CreateReg(X86::DI);
+ break;
+ case EA_BASE_BP_SI:
+ baseReg = MCOperand::CreateReg(X86::BP);
+ indexReg = MCOperand::CreateReg(X86::SI);
+ break;
+ case EA_BASE_BP_DI:
+ baseReg = MCOperand::CreateReg(X86::BP);
+ indexReg = MCOperand::CreateReg(X86::DI);
+ break;
+ default:
+ indexReg = MCOperand::CreateReg(0);
+ switch (insn.eaBase) {
+ default:
+ debug("Unexpected eaBase");
+ return true;
+ // Here, we will use the fill-ins defined above. However,
+ // BX_SI, BX_DI, BP_SI, and BP_DI are all handled above and
+ // sib and sib64 were handled in the top-level if, so they're only
+ // placeholders to keep the compiler happy.
+#define ENTRY(x) \
+ case EA_BASE_##x: \
+ baseReg = MCOperand::CreateReg(X86::x); break;
+ ALL_EA_BASES
+#undef ENTRY
+#define ENTRY(x) case EA_REG_##x:
+ ALL_REGS
+#undef ENTRY
+ debug("A R/M memory operand may not be a register; "
+ "the base field must be a base.");
+ return true;
+ }
+ }
+
+ scaleAmount = MCOperand::CreateImm(1);
+ }
+
+ displacement = MCOperand::CreateImm(insn.displacement);
+
+ static const uint8_t segmentRegnums[SEG_OVERRIDE_max] = {
+ 0, // SEG_OVERRIDE_NONE
+ X86::CS,
+ X86::SS,
+ X86::DS,
+ X86::ES,
+ X86::FS,
+ X86::GS
+ };
+
+ segmentReg = MCOperand::CreateReg(segmentRegnums[insn.segmentOverride]);
+
+ mcInst.addOperand(baseReg);
+ mcInst.addOperand(scaleAmount);
+ mcInst.addOperand(indexReg);
+ mcInst.addOperand(displacement);
+
+ if (sr)
+ mcInst.addOperand(segmentReg);
+
+ return false;
+}
+
+/// translateRM - Translates an operand stored in the R/M (and possibly SIB)
+/// byte of an instruction to LLVM form, and appends it to an MCInst.
+///
+/// @param mcInst - The MCInst to append to.
+/// @param operand - The operand, as stored in the descriptor table.
+/// @param insn - The instruction to extract Mod, R/M, and SIB fields
+/// from.
+/// @return - 0 on success; nonzero otherwise
+static bool translateRM(MCInst &mcInst,
+ OperandSpecifier &operand,
+ InternalInstruction &insn) {
+ switch (operand.type) {
+ default:
+ debug("Unexpected type for a R/M operand");
+ return true;
+ case TYPE_R8:
+ case TYPE_R16:
+ case TYPE_R32:
+ case TYPE_R64:
+ case TYPE_Rv:
+ case TYPE_MM:
+ case TYPE_MM32:
+ case TYPE_MM64:
+ case TYPE_XMM:
+ case TYPE_XMM32:
+ case TYPE_XMM64:
+ case TYPE_XMM128:
+ case TYPE_DEBUGREG:
+ case TYPE_CONTROLREG:
+ return translateRMRegister(mcInst, insn);
+ case TYPE_M:
+ case TYPE_M8:
+ case TYPE_M16:
+ case TYPE_M32:
+ case TYPE_M64:
+ case TYPE_M128:
+ case TYPE_M512:
+ case TYPE_Mv:
+ case TYPE_M32FP:
+ case TYPE_M64FP:
+ case TYPE_M80FP:
+ case TYPE_M16INT:
+ case TYPE_M32INT:
+ case TYPE_M64INT:
+ case TYPE_M1616:
+ case TYPE_M1632:
+ case TYPE_M1664:
+ return translateRMMemory(mcInst, insn, true);
+ case TYPE_LEA:
+ return translateRMMemory(mcInst, insn, false);
+ }
+}
+
+/// translateFPRegister - Translates a stack position on the FPU stack to its
+/// LLVM form, and appends it to an MCInst.
+///
+/// @param mcInst - The MCInst to append to.
+/// @param stackPos - The stack position to translate.
+/// @return - 0 on success; nonzero otherwise.
+static bool translateFPRegister(MCInst &mcInst,
+ uint8_t stackPos) {
+ if (stackPos >= 8) {
+ debug("Invalid FP stack position");
+ return true;
+ }
+
+ mcInst.addOperand(MCOperand::CreateReg(X86::ST0 + stackPos));
+
+ return false;
+}
+
+/// translateOperand - Translates an operand stored in an internal instruction
+/// to LLVM's format and appends it to an MCInst.
+///
+/// @param mcInst - The MCInst to append to.
+/// @param operand - The operand, as stored in the descriptor table.
+/// @param insn - The internal instruction.
+/// @return - false on success; true otherwise.
+static bool translateOperand(MCInst &mcInst,
+ OperandSpecifier &operand,
+ InternalInstruction &insn) {
+ switch (operand.encoding) {
+ default:
+ debug("Unhandled operand encoding during translation");
+ return true;
+ case ENCODING_REG:
+ translateRegister(mcInst, insn.reg);
+ return false;
+ case ENCODING_RM:
+ return translateRM(mcInst, operand, insn);
+ case ENCODING_CB:
+ case ENCODING_CW:
+ case ENCODING_CD:
+ case ENCODING_CP:
+ case ENCODING_CO:
+ case ENCODING_CT:
+ debug("Translation of code offsets isn't supported.");
+ return true;
+ case ENCODING_IB:
+ case ENCODING_IW:
+ case ENCODING_ID:
+ case ENCODING_IO:
+ case ENCODING_Iv:
+ case ENCODING_Ia:
+ translateImmediate(mcInst,
+ insn.immediates[insn.numImmediatesTranslated++],
+ operand,
+ insn);
+ return false;
+ case ENCODING_RB:
+ case ENCODING_RW:
+ case ENCODING_RD:
+ case ENCODING_RO:
+ translateRegister(mcInst, insn.opcodeRegister);
+ return false;
+ case ENCODING_I:
+ return translateFPRegister(mcInst, insn.opcodeModifier);
+ case ENCODING_Rv:
+ translateRegister(mcInst, insn.opcodeRegister);
+ return false;
+ case ENCODING_DUP:
+ return translateOperand(mcInst,
+ insn.spec->operands[operand.type - TYPE_DUP0],
+ insn);
+ }
+}
+
+/// translateInstruction - Translates an internal instruction and all its
+/// operands to an MCInst.
+///
+/// @param mcInst - The MCInst to populate with the instruction's data.
+/// @param insn - The internal instruction.
+/// @return - false on success; true otherwise.
+static bool translateInstruction(MCInst &mcInst,
+ InternalInstruction &insn) {
+ if (!insn.spec) {
+ debug("Instruction has no specification");
+ return true;
+ }
+
+ mcInst.setOpcode(insn.instructionID);
+
+ int index;
+
+ insn.numImmediatesTranslated = 0;
+
+ for (index = 0; index < X86_MAX_OPERANDS; ++index) {
+ if (insn.spec->operands[index].encoding != ENCODING_NONE) {
+ if (translateOperand(mcInst, insn.spec->operands[index], insn)) {
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+static MCDisassembler *createX86_32Disassembler(const Target &T) {
+ return new X86Disassembler::X86_32Disassembler;
+}
+
+static MCDisassembler *createX86_64Disassembler(const Target &T) {
+ return new X86Disassembler::X86_64Disassembler;
+}
+
+extern "C" void LLVMInitializeX86Disassembler() {
+ // Register the disassembler.
+ TargetRegistry::RegisterMCDisassembler(TheX86_32Target,
+ createX86_32Disassembler);
+ TargetRegistry::RegisterMCDisassembler(TheX86_64Target,
+ createX86_64Disassembler);
+}
diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.h b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.h
new file mode 100644
index 0000000..9c54262
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.h
@@ -0,0 +1,155 @@
+//===- X86Disassembler.h - Disassembler for x86 and x86_64 ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The X86 disassembler is a table-driven disassembler for the 16-, 32-, and
+// 64-bit X86 instruction sets. The main decode sequence for an assembly
+// instruction in this disassembler is:
+//
+// 1. Read the prefix bytes and determine the attributes of the instruction.
+// These attributes, recorded in enum attributeBits
+// (X86DisassemblerDecoderCommon.h), form a bitmask. The table CONTEXTS_SYM
+// provides a mapping from bitmasks to contexts, which are represented by
+// enum InstructionContext (ibid.).
+//
+// 2. Read the opcode, and determine what kind of opcode it is. The
+// disassembler distinguishes four kinds of opcodes, which are enumerated in
+// OpcodeType (X86DisassemblerDecoderCommon.h): one-byte (0xnn), two-byte
+// (0x0f 0xnn), three-byte-38 (0x0f 0x38 0xnn), or three-byte-3a
+// (0x0f 0x3a 0xnn). Mandatory prefixes are treated as part of the context.
+//
+// 3. Depending on the opcode type, look in one of four ClassDecision structures
+// (X86DisassemblerDecoderCommon.h). Use the opcode class to determine which
+//    OpcodeDecision (ibid.) to look the opcode up in. Look up the opcode to get
+// a ModRMDecision (ibid.).
+//
+// 4. Some instructions, such as escape opcodes or extended opcodes, or even
+// instructions that have ModRM*Reg / ModRM*Mem forms in LLVM, need the
+// ModR/M byte to complete decode. The ModRMDecision's type is an entry from
+// ModRMDecisionType (X86DisassemblerDecoderCommon.h) that indicates if the
+// ModR/M byte is required and how to interpret it.
+//
+// 5. After resolving the ModRMDecision, the disassembler has a unique ID
+// of type InstrUID (X86DisassemblerDecoderCommon.h). Looking this ID up in
+// INSTRUCTIONS_SYM yields the name of the instruction and the encodings and
+// meanings of its operands.
+//
+// 6. For each operand, its encoding is an entry from OperandEncoding
+// (X86DisassemblerDecoderCommon.h) and its type is an entry from
+// OperandType (ibid.). The encoding indicates how to read it from the
+// instruction; the type indicates how to interpret the value once it has
+// been read. For example, a register operand could be stored in the R/M
+// field of the ModR/M byte, the REG field of the ModR/M byte, or added to
+//    the main opcode. This is orthogonal to its meaning (a GPR or an XMM
+// register, for instance). Given this information, the operands can be
+// extracted and interpreted.
+//
+// 7. As the last step, the disassembler translates the instruction information
+// and operands into a format understandable by the client - in this case, an
+// MCInst for use by the MC infrastructure.
+//
+// The disassembler is broken broadly into two parts: the table emitter that
+// emits the instruction decode tables discussed above during compilation, and
+// the disassembler itself. The table emitter is documented in more detail in
+// utils/TableGen/X86DisassemblerEmitter.h.
+//
+// X86Disassembler.h contains the public interface for the disassembler,
+// adhering to the MCDisassembler interface.
+// X86Disassembler.cpp contains the code responsible for step 7, and for
+// invoking the decoder to execute steps 1-6.
+// X86DisassemblerDecoderCommon.h contains the definitions needed by both the
+// table emitter and the disassembler.
+// X86DisassemblerDecoder.h contains the public interface of the decoder,
+// factored out into C for possible use by other projects.
+// X86DisassemblerDecoder.c contains the source code of the decoder, which is
+// responsible for steps 1-6.
+//
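+// As a minimal usage sketch (not part of this interface, and assuming a
+// hypothetical MemoryObject wrapper named BufferMemoryObject over a byte
+// buffer), a client would drive the disassembler through the MCDisassembler
+// API implemented below:
+//
+//   X86Disassembler::X86_32Disassembler disassembler;
+//   BufferMemoryObject region(bytes, numBytes);     // hypothetical helper
+//   MCInst inst;
+//   uint64_t size;
+//   if (disassembler.getInstruction(inst, size, region, 0, nulls()))
+//     ;  // success: 'inst' holds the instruction, 'size' its byte length
+//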
+//===----------------------------------------------------------------------===//
+
+#ifndef X86DISASSEMBLER_H
+#define X86DISASSEMBLER_H
+
+#define INSTRUCTION_SPECIFIER_FIELDS \
+ const char* name;
+
+#define INSTRUCTION_IDS \
+ InstrUID* instructionIDs;
+
+#include "X86DisassemblerDecoderCommon.h"
+
+#undef INSTRUCTION_SPECIFIER_FIELDS
+#undef INSTRUCTION_IDS
+
+#include "llvm/MC/MCDisassembler.h"
+
+struct InternalInstruction;
+
+namespace llvm {
+
+class MCInst;
+class MemoryObject;
+class raw_ostream;
+
+struct EDInstInfo;
+
+namespace X86Disassembler {
+
+/// X86GenericDisassembler - Generic disassembler for all X86 platforms.
+/// All that each platform class should have to do is subclass the constructor and
+/// provide a different disassemblerMode value.
+class X86GenericDisassembler : public MCDisassembler {
+protected:
+ /// Constructor - Initializes the disassembler.
+ ///
+ /// @param mode - The X86 architecture mode to decode for.
+ X86GenericDisassembler(DisassemblerMode mode);
+public:
+ ~X86GenericDisassembler();
+
+ /// getInstruction - See MCDisassembler.
+ bool getInstruction(MCInst &instr,
+ uint64_t &size,
+ const MemoryObject &region,
+ uint64_t address,
+ raw_ostream &vStream) const;
+
+ /// getEDInfo - See MCDisassembler.
+ EDInstInfo *getEDInfo() const;
+private:
+ DisassemblerMode fMode;
+};
+
+/// X86_16Disassembler - 16-bit X86 disassembler.
+class X86_16Disassembler : public X86GenericDisassembler {
+public:
+ X86_16Disassembler() :
+ X86GenericDisassembler(MODE_16BIT) {
+ }
+};
+
+/// X86_32Disassembler - 32-bit X86 disassembler.
+class X86_32Disassembler : public X86GenericDisassembler {
+public:
+ X86_32Disassembler() :
+ X86GenericDisassembler(MODE_32BIT) {
+ }
+};
+
+/// X86_64Disassembler - 64-bit X86 disassembler.
+class X86_64Disassembler : public X86GenericDisassembler {
+public:
+ X86_64Disassembler() :
+ X86GenericDisassembler(MODE_64BIT) {
+ }
+};
+
+} // namespace X86Disassembler
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
new file mode 100644
index 0000000..6c3ff6b
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
@@ -0,0 +1,1388 @@
+/*===- X86DisassemblerDecoder.c - Disassembler decoder -------------*- C -*-==*
+ *
+ * The LLVM Compiler Infrastructure
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *
+ *===----------------------------------------------------------------------===*
+ *
+ * This file is part of the X86 Disassembler.
+ * It contains the implementation of the instruction decoder.
+ * Documentation for the disassembler can be found in X86Disassembler.h.
+ *
+ *===----------------------------------------------------------------------===*/
+
+#include <stdarg.h> /* for va_*() */
+#include <stdio.h> /* for vsnprintf() */
+#include <stdlib.h> /* for exit() */
+#include <string.h> /* for memset() */
+
+#include "X86DisassemblerDecoder.h"
+
+#include "X86GenDisassemblerTables.inc"
+
+#define TRUE 1
+#define FALSE 0
+
+typedef int8_t bool;
+
+#ifdef __GNUC__
+#define NORETURN __attribute__((noreturn))
+#else
+#define NORETURN
+#endif
+
+#ifndef NDEBUG
+#define debug(s) do { x86DisassemblerDebug(__FILE__, __LINE__, s); } while (0)
+#else
+#define debug(s) do { } while (0)
+#endif
+
+
+/*
+ * contextForAttrs - Client for the instruction context table. Takes a set of
+ * attributes and returns the appropriate decode context.
+ *
+ * @param attrMask - Attributes, from the enumeration attributeBits.
+ * @return       - The InstructionContext to use when looking up an
+ *                 instruction with these attributes.
+ */
+static InstructionContext contextForAttrs(uint8_t attrMask) {
+ return CONTEXTS_SYM[attrMask];
+}
+
+/*
+ * modRMRequired - Reads the appropriate instruction table to determine whether
+ * the ModR/M byte is required to decode a particular instruction.
+ *
+ * @param type - The opcode type (i.e., how many bytes it has).
+ * @param insnContext - The context for the instruction, as returned by
+ * contextForAttrs.
+ * @param opcode - The last byte of the instruction's opcode, not counting
+ * ModR/M extensions and escapes.
+ * @return - TRUE if the ModR/M byte is required, FALSE otherwise.
+ */
+static int modRMRequired(OpcodeType type,
+ InstructionContext insnContext,
+ uint8_t opcode) {
+ const struct ContextDecision* decision = 0;
+
+ switch (type) {
+ case ONEBYTE:
+ decision = &ONEBYTE_SYM;
+ break;
+ case TWOBYTE:
+ decision = &TWOBYTE_SYM;
+ break;
+ case THREEBYTE_38:
+ decision = &THREEBYTE38_SYM;
+ break;
+ case THREEBYTE_3A:
+ decision = &THREEBYTE3A_SYM;
+ break;
+ }
+
+ return decision->opcodeDecisions[insnContext].modRMDecisions[opcode].
+ modrm_type != MODRM_ONEENTRY;
+}
+
+/*
+ * decode - Reads the appropriate instruction table to obtain the unique ID of
+ * an instruction.
+ *
+ * @param type - See modRMRequired().
+ * @param insnContext - See modRMRequired().
+ * @param opcode - See modRMRequired().
+ * @param modRM - The ModR/M byte if required, or any value if not.
+ * @return - The UID of the instruction, or 0 on failure.
+ */
+static InstrUID decode(OpcodeType type,
+ InstructionContext insnContext,
+ uint8_t opcode,
+ uint8_t modRM) {
+ struct ModRMDecision* dec;
+
+ switch (type) {
+ default:
+ debug("Unknown opcode type");
+ return 0;
+ case ONEBYTE:
+ dec = &ONEBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+ break;
+ case TWOBYTE:
+ dec = &TWOBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+ break;
+ case THREEBYTE_38:
+ dec = &THREEBYTE38_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+ break;
+ case THREEBYTE_3A:
+ dec = &THREEBYTE3A_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+ break;
+ }
+
+ switch (dec->modrm_type) {
+ default:
+ debug("Corrupt table! Unknown modrm_type");
+ return 0;
+ case MODRM_ONEENTRY:
+ return dec->instructionIDs[0];
+ case MODRM_SPLITRM:
+ if (modFromModRM(modRM) == 0x3)
+ return dec->instructionIDs[1];
+ else
+ return dec->instructionIDs[0];
+ case MODRM_FULL:
+ return dec->instructionIDs[modRM];
+ }
+}
+
+/*
+ * specifierForUID - Given a UID, returns the name and operand specification for
+ * that instruction.
+ *
+ * @param uid - The unique ID for the instruction. This should be returned by
+ * decode(); specifierForUID will not check bounds.
+ * @return - A pointer to the specification for that instruction.
+ */
+static struct InstructionSpecifier* specifierForUID(InstrUID uid) {
+ return &INSTRUCTIONS_SYM[uid];
+}
+
+/*
+ * consumeByte - Uses the reader function provided by the user to consume one
+ * byte from the instruction's memory and advance the cursor.
+ *
+ * @param insn - The instruction with the reader function to use. The cursor
+ * for this instruction is advanced.
+ * @param byte - A pointer to a pre-allocated memory buffer to be populated
+ * with the data read.
+ * @return - 0 if the read was successful; nonzero otherwise.
+ */
+static int consumeByte(struct InternalInstruction* insn, uint8_t* byte) {
+ int ret = insn->reader(insn->readerArg, byte, insn->readerCursor);
+
+ if (!ret)
+ ++(insn->readerCursor);
+
+ return ret;
+}
+
+/*
+ * lookAtByte - Like consumeByte, but does not advance the cursor.
+ *
+ * @param insn - See consumeByte().
+ * @param byte - See consumeByte().
+ * @return - See consumeByte().
+ */
+static int lookAtByte(struct InternalInstruction* insn, uint8_t* byte) {
+ return insn->reader(insn->readerArg, byte, insn->readerCursor);
+}
+
+static void unconsumeByte(struct InternalInstruction* insn) {
+ insn->readerCursor--;
+}
+
+#define CONSUME_FUNC(name, type) \
+ static int name(struct InternalInstruction* insn, type* ptr) { \
+ type combined = 0; \
+ unsigned offset; \
+ for (offset = 0; offset < sizeof(type); ++offset) { \
+ uint8_t byte; \
+ int ret = insn->reader(insn->readerArg, \
+ &byte, \
+ insn->readerCursor + offset); \
+ if (ret) \
+ return ret; \
+ combined = combined | ((type)byte << ((type)offset * 8)); \
+ } \
+ *ptr = combined; \
+ insn->readerCursor += sizeof(type); \
+ return 0; \
+ }
+
+/*
+ * consume* - Use the reader function provided by the user to consume data
+ * values of various sizes from the instruction's memory and advance the
+ * cursor appropriately. These readers perform endian conversion.
+ *
+ * @param insn - See consumeByte().
+ * @param ptr - A pointer to a pre-allocated memory of appropriate size to
+ * be populated with the data read.
+ * @return - See consumeByte().
+ */
+CONSUME_FUNC(consumeInt8, int8_t)
+CONSUME_FUNC(consumeInt16, int16_t)
+CONSUME_FUNC(consumeInt32, int32_t)
+CONSUME_FUNC(consumeUInt16, uint16_t)
+CONSUME_FUNC(consumeUInt32, uint32_t)
+CONSUME_FUNC(consumeUInt64, uint64_t)
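+
+/*
+ * For illustration: the readers above assemble bytes little-endian, so
+ * consumeUInt16() over the byte sequence 0x34 0x12 yields the value 0x1234
+ * and advances the reader cursor by two bytes.
+ */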
+
+/*
+ * dbgprintf - Uses the logging function provided by the user to log a single
+ * message, typically without a carriage-return.
+ *
+ * @param insn - The instruction containing the logging function.
+ * @param format - See printf().
+ * @param ... - See printf().
+ */
+static void dbgprintf(struct InternalInstruction* insn,
+ const char* format,
+ ...) {
+ char buffer[256];
+ va_list ap;
+
+ if (!insn->dlog)
+ return;
+
+ va_start(ap, format);
+ (void)vsnprintf(buffer, sizeof(buffer), format, ap);
+ va_end(ap);
+
+ insn->dlog(insn->dlogArg, buffer);
+
+ return;
+}
+
+/*
+ * setPrefixPresent - Marks that a particular prefix is present at a particular
+ * location.
+ *
+ * @param insn - The instruction to be marked as having the prefix.
+ * @param prefix - The prefix that is present.
+ * @param location - The location where the prefix is located (in the address
+ * space of the instruction's reader).
+ */
+static void setPrefixPresent(struct InternalInstruction* insn,
+ uint8_t prefix,
+ uint64_t location)
+{
+ insn->prefixPresent[prefix] = 1;
+ insn->prefixLocations[prefix] = location;
+}
+
+/*
+ * isPrefixAtLocation - Queries an instruction to determine whether a prefix is
+ * present at a given location.
+ *
+ * @param insn - The instruction to be queried.
+ * @param prefix - The prefix.
+ * @param location - The location to query.
+ * @return - Whether the prefix is at that location.
+ */
+static BOOL isPrefixAtLocation(struct InternalInstruction* insn,
+ uint8_t prefix,
+ uint64_t location)
+{
+ if (insn->prefixPresent[prefix] == 1 &&
+ insn->prefixLocations[prefix] == location)
+ return TRUE;
+ else
+ return FALSE;
+}
+
+/*
+ * readPrefixes - Consumes all of an instruction's prefix bytes, and marks the
+ * instruction as having them. Also sets the instruction's default operand,
+ * address, and other relevant data sizes to report operands correctly.
+ *
+ * @param insn - The instruction whose prefixes are to be read.
+ * @return - 0 if the instruction could be read until the end of the prefix
+ * bytes, and no prefixes conflicted; nonzero otherwise.
+ */
+static int readPrefixes(struct InternalInstruction* insn) {
+ BOOL isPrefix = TRUE;
+ BOOL prefixGroups[4] = { FALSE };
+ uint64_t prefixLocation;
+ uint8_t byte;
+
+ BOOL hasAdSize = FALSE;
+ BOOL hasOpSize = FALSE;
+
+ dbgprintf(insn, "readPrefixes()");
+
+ while (isPrefix) {
+ prefixLocation = insn->readerCursor;
+
+ if (consumeByte(insn, &byte))
+ return -1;
+
+ switch (byte) {
+ case 0xf0: /* LOCK */
+ case 0xf2: /* REPNE/REPNZ */
+ case 0xf3: /* REP or REPE/REPZ */
+ if (prefixGroups[0])
+ dbgprintf(insn, "Redundant Group 1 prefix");
+ prefixGroups[0] = TRUE;
+ setPrefixPresent(insn, byte, prefixLocation);
+ break;
+ case 0x2e: /* CS segment override -OR- Branch not taken */
+ case 0x36: /* SS segment override -OR- Branch taken */
+ case 0x3e: /* DS segment override */
+ case 0x26: /* ES segment override */
+ case 0x64: /* FS segment override */
+ case 0x65: /* GS segment override */
+ switch (byte) {
+ case 0x2e:
+ insn->segmentOverride = SEG_OVERRIDE_CS;
+ break;
+ case 0x36:
+ insn->segmentOverride = SEG_OVERRIDE_SS;
+ break;
+ case 0x3e:
+ insn->segmentOverride = SEG_OVERRIDE_DS;
+ break;
+ case 0x26:
+ insn->segmentOverride = SEG_OVERRIDE_ES;
+ break;
+ case 0x64:
+ insn->segmentOverride = SEG_OVERRIDE_FS;
+ break;
+ case 0x65:
+ insn->segmentOverride = SEG_OVERRIDE_GS;
+ break;
+ default:
+ debug("Unhandled override");
+ return -1;
+ }
+ if (prefixGroups[1])
+ dbgprintf(insn, "Redundant Group 2 prefix");
+ prefixGroups[1] = TRUE;
+ setPrefixPresent(insn, byte, prefixLocation);
+ break;
+ case 0x66: /* Operand-size override */
+ if (prefixGroups[2])
+ dbgprintf(insn, "Redundant Group 3 prefix");
+ prefixGroups[2] = TRUE;
+ hasOpSize = TRUE;
+ setPrefixPresent(insn, byte, prefixLocation);
+ break;
+ case 0x67: /* Address-size override */
+ if (prefixGroups[3])
+ dbgprintf(insn, "Redundant Group 4 prefix");
+ prefixGroups[3] = TRUE;
+ hasAdSize = TRUE;
+ setPrefixPresent(insn, byte, prefixLocation);
+ break;
+ default: /* Not a prefix byte */
+ isPrefix = FALSE;
+ break;
+ }
+
+ if (isPrefix)
+ dbgprintf(insn, "Found prefix 0x%hhx", byte);
+ }
+
+ if (insn->mode == MODE_64BIT) {
+ if ((byte & 0xf0) == 0x40) {
+ uint8_t opcodeByte;
+
+ if (lookAtByte(insn, &opcodeByte) || ((opcodeByte & 0xf0) == 0x40)) {
+ dbgprintf(insn, "Redundant REX prefix");
+ return -1;
+ }
+
+ insn->rexPrefix = byte;
+ insn->necessaryPrefixLocation = insn->readerCursor - 2;
+
+ dbgprintf(insn, "Found REX prefix 0x%hhx", byte);
+ } else {
+ unconsumeByte(insn);
+ insn->necessaryPrefixLocation = insn->readerCursor - 1;
+ }
+ } else {
+ unconsumeByte(insn);
+ }
+
+ if (insn->mode == MODE_16BIT) {
+ insn->registerSize = (hasOpSize ? 4 : 2);
+ insn->addressSize = (hasAdSize ? 4 : 2);
+ insn->displacementSize = (hasAdSize ? 4 : 2);
+ insn->immediateSize = (hasOpSize ? 4 : 2);
+ } else if (insn->mode == MODE_32BIT) {
+ insn->registerSize = (hasOpSize ? 2 : 4);
+ insn->addressSize = (hasAdSize ? 2 : 4);
+ insn->displacementSize = (hasAdSize ? 2 : 4);
+    insn->immediateSize = (hasOpSize ? 2 : 4);
+ } else if (insn->mode == MODE_64BIT) {
+ if (insn->rexPrefix && wFromREX(insn->rexPrefix)) {
+ insn->registerSize = 8;
+ insn->addressSize = (hasAdSize ? 4 : 8);
+ insn->displacementSize = 4;
+ insn->immediateSize = 4;
+ } else if (insn->rexPrefix) {
+ insn->registerSize = (hasOpSize ? 2 : 4);
+ insn->addressSize = (hasAdSize ? 4 : 8);
+ insn->displacementSize = (hasOpSize ? 2 : 4);
+ insn->immediateSize = (hasOpSize ? 2 : 4);
+ } else {
+ insn->registerSize = (hasOpSize ? 2 : 4);
+ insn->addressSize = (hasAdSize ? 4 : 8);
+ insn->displacementSize = (hasOpSize ? 2 : 4);
+ insn->immediateSize = (hasOpSize ? 2 : 4);
+ }
+ }
+
+ return 0;
+}
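+
+/*
+ * For illustration of the defaults above: in MODE_32BIT, a lone 0x66 prefix
+ * drops the register size from 4 to 2 bytes while leaving the address size
+ * at 4; a lone 0x67 prefix leaves the register size at 4 and drops the
+ * address size to 2.
+ */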
+
+/*
+ * readOpcode - Reads the opcode (excepting the ModR/M byte in the case of
+ * extended or escape opcodes).
+ *
+ * @param insn - The instruction whose opcode is to be read.
+ * @return - 0 if the opcode could be read successfully; nonzero otherwise.
+ */
+static int readOpcode(struct InternalInstruction* insn) {
+ /* Determine the length of the primary opcode */
+
+ uint8_t current;
+
+ dbgprintf(insn, "readOpcode()");
+
+ insn->opcodeType = ONEBYTE;
+ if (consumeByte(insn, &current))
+ return -1;
+
+ if (current == 0x0f) {
+ dbgprintf(insn, "Found a two-byte escape prefix (0x%hhx)", current);
+
+ insn->twoByteEscape = current;
+
+ if (consumeByte(insn, &current))
+ return -1;
+
+ if (current == 0x38) {
+ dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
+
+ insn->threeByteEscape = current;
+
+ if (consumeByte(insn, &current))
+ return -1;
+
+ insn->opcodeType = THREEBYTE_38;
+ } else if (current == 0x3a) {
+ dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
+
+ insn->threeByteEscape = current;
+
+ if (consumeByte(insn, &current))
+ return -1;
+
+ insn->opcodeType = THREEBYTE_3A;
+ } else {
+ dbgprintf(insn, "Didn't find a three-byte escape prefix");
+
+ insn->opcodeType = TWOBYTE;
+ }
+ }
+
+ /*
+ * At this point we have consumed the full opcode.
+ * Anything we consume from here on must be unconsumed.
+ */
+
+ insn->opcode = current;
+
+ return 0;
+}
+
+static int readModRM(struct InternalInstruction* insn);
+
+/*
+ * getIDWithAttrMask - Determines the ID of an instruction, consuming
+ * the ModR/M byte as appropriate for extended and escape opcodes,
+ * and using a supplied attribute mask.
+ *
+ * @param instructionID - A pointer whose target is filled in with the ID of the
+ * instruction.
+ * @param insn - The instruction whose ID is to be determined.
+ * @param attrMask - The attribute mask to search.
+ * @return - 0 if the ModR/M could be read when needed or was not
+ * needed; nonzero otherwise.
+ */
+static int getIDWithAttrMask(uint16_t* instructionID,
+ struct InternalInstruction* insn,
+ uint8_t attrMask) {
+ BOOL hasModRMExtension;
+
+ uint8_t instructionClass;
+
+ instructionClass = contextForAttrs(attrMask);
+
+ hasModRMExtension = modRMRequired(insn->opcodeType,
+ instructionClass,
+ insn->opcode);
+
+ if (hasModRMExtension) {
+ readModRM(insn);
+
+ *instructionID = decode(insn->opcodeType,
+ instructionClass,
+ insn->opcode,
+ insn->modRM);
+ } else {
+ *instructionID = decode(insn->opcodeType,
+ instructionClass,
+ insn->opcode,
+ 0);
+ }
+
+ return 0;
+}
+
+/*
+ * is16BitEquivalent - Determines whether two instruction names refer to
+ * equivalent instructions but one is 16-bit whereas the other is not.
+ *
+ * @param orig - The instruction that is not 16-bit
+ * @param equiv - The instruction that is 16-bit
+ */
+static BOOL is16BitEquivalent(const char* orig, const char* equiv) {
+ off_t i;
+
+ for (i = 0;; i++) {
+ if (orig[i] == '\0' && equiv[i] == '\0')
+ return TRUE;
+ if (orig[i] == '\0' || equiv[i] == '\0')
+ return FALSE;
+ if (orig[i] != equiv[i]) {
+ if ((orig[i] == 'Q' || orig[i] == 'L') && equiv[i] == 'W')
+ continue;
+ if ((orig[i] == '6' || orig[i] == '3') && equiv[i] == '1')
+ continue;
+ if ((orig[i] == '4' || orig[i] == '2') && equiv[i] == '6')
+ continue;
+ return FALSE;
+ }
+ }
+}
+
+/*
+ * is64BitEquivalent - Determines whether two instruction names refer to
+ * equivalent instructions but one is 64-bit whereas the other is not.
+ *
+ * @param orig - The instruction that is not 64-bit
+ * @param equiv - The instruction that is 64-bit
+ */
+static BOOL is64BitEquivalent(const char* orig, const char* equiv) {
+ off_t i;
+
+ for (i = 0;; i++) {
+ if (orig[i] == '\0' && equiv[i] == '\0')
+ return TRUE;
+ if (orig[i] == '\0' || equiv[i] == '\0')
+ return FALSE;
+ if (orig[i] != equiv[i]) {
+ if ((orig[i] == 'W' || orig[i] == 'L') && equiv[i] == 'Q')
+ continue;
+ if ((orig[i] == '1' || orig[i] == '3') && equiv[i] == '6')
+ continue;
+ if ((orig[i] == '6' || orig[i] == '2') && equiv[i] == '4')
+ continue;
+ return FALSE;
+ }
+ }
+}
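+
+/*
+ * For illustration: under these rules "MOV16ri" is accepted as the 16-bit
+ * equivalent of "MOV32ri" (the '3'/'1' and '2'/'6' pairs match), while
+ * "MOV64ri32" is not accepted as a 64-bit equivalent of "MOV32ri" because
+ * the names differ in length.
+ */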
+
+
+/*
+ * getID - Determines the ID of an instruction, consuming the ModR/M byte as
+ * appropriate for extended and escape opcodes. Determines the attributes and
+ * context for the instruction before doing so.
+ *
+ * @param insn - The instruction whose ID is to be determined.
+ * @return - 0 if the ModR/M could be read when needed or was not needed;
+ * nonzero otherwise.
+ */
+static int getID(struct InternalInstruction* insn) {
+ uint8_t attrMask;
+ uint16_t instructionID;
+
+ dbgprintf(insn, "getID()");
+
+ attrMask = ATTR_NONE;
+
+ if (insn->mode == MODE_64BIT)
+ attrMask |= ATTR_64BIT;
+
+ if (insn->rexPrefix & 0x08)
+ attrMask |= ATTR_REXW;
+
+ if (isPrefixAtLocation(insn, 0x66, insn->necessaryPrefixLocation))
+ attrMask |= ATTR_OPSIZE;
+ else if (isPrefixAtLocation(insn, 0xf3, insn->necessaryPrefixLocation))
+ attrMask |= ATTR_XS;
+ else if (isPrefixAtLocation(insn, 0xf2, insn->necessaryPrefixLocation))
+ attrMask |= ATTR_XD;
+
+ if (getIDWithAttrMask(&instructionID, insn, attrMask))
+ return -1;
+
+ /* The following clauses compensate for limitations of the tables. */
+
+ if ((attrMask & ATTR_XD) && (attrMask & ATTR_REXW)) {
+ /*
+ * Although for SSE instructions it is usually necessary to treat REX.W+F2
+ * as F2 for decode (in the absence of a 64BIT_REXW_XD category) there is
+ * an occasional instruction where F2 is incidental and REX.W is the more
+ * significant. If the decoded instruction is 32-bit and adding REX.W
+ * instead of F2 changes a 32 to a 64, we adopt the new encoding.
+ */
+
+ struct InstructionSpecifier* spec;
+ uint16_t instructionIDWithREXw;
+ struct InstructionSpecifier* specWithREXw;
+
+ spec = specifierForUID(instructionID);
+
+ if (getIDWithAttrMask(&instructionIDWithREXw,
+ insn,
+ attrMask & (~ATTR_XD))) {
+ /*
+ * Decoding with REX.w would yield nothing; give up and return original
+ * decode.
+ */
+
+ insn->instructionID = instructionID;
+ insn->spec = spec;
+ return 0;
+ }
+
+ specWithREXw = specifierForUID(instructionIDWithREXw);
+
+ if (is64BitEquivalent(spec->name, specWithREXw->name)) {
+ insn->instructionID = instructionIDWithREXw;
+ insn->spec = specWithREXw;
+ } else {
+ insn->instructionID = instructionID;
+ insn->spec = spec;
+ }
+ return 0;
+ }
+
+ if (insn->prefixPresent[0x66] && !(attrMask & ATTR_OPSIZE)) {
+ /*
+ * The instruction tables make no distinction between instructions that
+ * allow OpSize anywhere (i.e., 16-bit operations) and that need it in a
+ * particular spot (i.e., many MMX operations). In general we're
+ * conservative, but in the specific case where OpSize is present but not
+ * in the right place we check if there's a 16-bit operation.
+ */
+
+ struct InstructionSpecifier* spec;
+ uint16_t instructionIDWithOpsize;
+ struct InstructionSpecifier* specWithOpsize;
+
+ spec = specifierForUID(instructionID);
+
+ if (getIDWithAttrMask(&instructionIDWithOpsize,
+ insn,
+ attrMask | ATTR_OPSIZE)) {
+ /*
+ * ModRM required with OpSize but not present; give up and return version
+ * without OpSize set
+ */
+
+ insn->instructionID = instructionID;
+ insn->spec = spec;
+ return 0;
+ }
+
+ specWithOpsize = specifierForUID(instructionIDWithOpsize);
+
+    if (is16BitEquivalent(spec->name, specWithOpsize->name)) {
+ insn->instructionID = instructionIDWithOpsize;
+ insn->spec = specWithOpsize;
+ } else {
+ insn->instructionID = instructionID;
+ insn->spec = spec;
+ }
+ return 0;
+ }
+
+ insn->instructionID = instructionID;
+ insn->spec = specifierForUID(insn->instructionID);
+
+ return 0;
+}
+
+/*
+ * readSIB - Consumes the SIB byte to determine addressing information for an
+ * instruction.
+ *
+ * @param insn - The instruction whose SIB byte is to be read.
+ * @return - 0 if the SIB byte was successfully read; nonzero otherwise.
+ */
+static int readSIB(struct InternalInstruction* insn) {
+ SIBIndex sibIndexBase = 0;
+ SIBBase sibBaseBase = 0;
+ uint8_t index, base;
+
+ dbgprintf(insn, "readSIB()");
+
+ if (insn->consumedSIB)
+ return 0;
+
+ insn->consumedSIB = TRUE;
+
+ switch (insn->addressSize) {
+ case 2:
+ dbgprintf(insn, "SIB-based addressing doesn't work in 16-bit mode");
+ return -1;
+ break;
+ case 4:
+ sibIndexBase = SIB_INDEX_EAX;
+ sibBaseBase = SIB_BASE_EAX;
+ break;
+ case 8:
+ sibIndexBase = SIB_INDEX_RAX;
+ sibBaseBase = SIB_BASE_RAX;
+ break;
+ }
+
+ if (consumeByte(insn, &insn->sib))
+ return -1;
+
+ index = indexFromSIB(insn->sib) | (xFromREX(insn->rexPrefix) << 3);
+
+ switch (index) {
+ case 0x4:
+ insn->sibIndex = SIB_INDEX_NONE;
+ break;
+ default:
+ insn->sibIndex = (EABase)(sibIndexBase + index);
+ if (insn->sibIndex == SIB_INDEX_sib ||
+ insn->sibIndex == SIB_INDEX_sib64)
+ insn->sibIndex = SIB_INDEX_NONE;
+ break;
+ }
+
+ switch (scaleFromSIB(insn->sib)) {
+ case 0:
+ insn->sibScale = 1;
+ break;
+ case 1:
+ insn->sibScale = 2;
+ break;
+ case 2:
+ insn->sibScale = 4;
+ break;
+ case 3:
+ insn->sibScale = 8;
+ break;
+ }
+
+ base = baseFromSIB(insn->sib) | (bFromREX(insn->rexPrefix) << 3);
+
+ switch (base) {
+ case 0x5:
+ switch (modFromModRM(insn->modRM)) {
+ case 0x0:
+ insn->eaDisplacement = EA_DISP_32;
+ insn->sibBase = SIB_BASE_NONE;
+ break;
+ case 0x1:
+ insn->eaDisplacement = EA_DISP_8;
+ insn->sibBase = (insn->addressSize == 4 ?
+ SIB_BASE_EBP : SIB_BASE_RBP);
+ break;
+ case 0x2:
+ insn->eaDisplacement = EA_DISP_32;
+ insn->sibBase = (insn->addressSize == 4 ?
+ SIB_BASE_EBP : SIB_BASE_RBP);
+ break;
+ case 0x3:
+ debug("Cannot have Mod = 0b11 and a SIB byte");
+ return -1;
+ }
+ break;
+ default:
+ insn->sibBase = (EABase)(sibBaseBase + base);
+ break;
+ }
+
+ return 0;
+}
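+
+/*
+ * Worked example (32-bit addressing, no REX prefix): a SIB byte of 0xd8 has
+ * scale bits 0b11, index 0b011, and base 0b000, so it decodes to sibScale 8,
+ * sibIndex SIB_INDEX_EBX, and sibBase SIB_BASE_EAX, i.e. [EAX + EBX*8].
+ */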
+
+/*
+ * readDisplacement - Consumes the displacement of an instruction.
+ *
+ * @param insn - The instruction whose displacement is to be read.
+ * @return - 0 if the displacement byte was successfully read; nonzero
+ * otherwise.
+ */
+static int readDisplacement(struct InternalInstruction* insn) {
+ int8_t d8;
+ int16_t d16;
+ int32_t d32;
+
+ dbgprintf(insn, "readDisplacement()");
+
+ if (insn->consumedDisplacement)
+ return 0;
+
+ insn->consumedDisplacement = TRUE;
+
+ switch (insn->eaDisplacement) {
+ case EA_DISP_NONE:
+ insn->consumedDisplacement = FALSE;
+ break;
+ case EA_DISP_8:
+ if (consumeInt8(insn, &d8))
+ return -1;
+ insn->displacement = d8;
+ break;
+ case EA_DISP_16:
+ if (consumeInt16(insn, &d16))
+ return -1;
+ insn->displacement = d16;
+ break;
+ case EA_DISP_32:
+ if (consumeInt32(insn, &d32))
+ return -1;
+ insn->displacement = d32;
+ break;
+ }
+
+ insn->consumedDisplacement = TRUE;
+ return 0;
+}
+
+/*
+ * readModRM - Consumes all addressing information (ModR/M byte, SIB byte, and
+ * displacement) for an instruction and interprets it.
+ *
+ * @param insn - The instruction whose addressing information is to be read.
+ * @return - 0 if the information was successfully read; nonzero otherwise.
+ */
+static int readModRM(struct InternalInstruction* insn) {
+ uint8_t mod, rm, reg;
+
+ dbgprintf(insn, "readModRM()");
+
+ if (insn->consumedModRM)
+ return 0;
+
+ consumeByte(insn, &insn->modRM);
+ insn->consumedModRM = TRUE;
+
+ mod = modFromModRM(insn->modRM);
+ rm = rmFromModRM(insn->modRM);
+ reg = regFromModRM(insn->modRM);
+
+ /*
+ * This goes by insn->registerSize to pick the correct register, which messes
+ * up if we're using (say) XMM or 8-bit register operands. That gets fixed in
+ * fixupReg().
+ */
+ switch (insn->registerSize) {
+ case 2:
+ insn->regBase = MODRM_REG_AX;
+ insn->eaRegBase = EA_REG_AX;
+ break;
+ case 4:
+ insn->regBase = MODRM_REG_EAX;
+ insn->eaRegBase = EA_REG_EAX;
+ break;
+ case 8:
+ insn->regBase = MODRM_REG_RAX;
+ insn->eaRegBase = EA_REG_RAX;
+ break;
+ }
+
+ reg |= rFromREX(insn->rexPrefix) << 3;
+ rm |= bFromREX(insn->rexPrefix) << 3;
+
+ insn->reg = (Reg)(insn->regBase + reg);
+
+ switch (insn->addressSize) {
+ case 2:
+ insn->eaBaseBase = EA_BASE_BX_SI;
+
+ switch (mod) {
+ case 0x0:
+ if (rm == 0x6) {
+ insn->eaBase = EA_BASE_NONE;
+ insn->eaDisplacement = EA_DISP_16;
+ if (readDisplacement(insn))
+ return -1;
+ } else {
+ insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+ insn->eaDisplacement = EA_DISP_NONE;
+ }
+ break;
+ case 0x1:
+ insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+ insn->eaDisplacement = EA_DISP_8;
+ if (readDisplacement(insn))
+ return -1;
+ break;
+ case 0x2:
+ insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+ insn->eaDisplacement = EA_DISP_16;
+ if (readDisplacement(insn))
+ return -1;
+ break;
+ case 0x3:
+ insn->eaBase = (EABase)(insn->eaRegBase + rm);
+ if (readDisplacement(insn))
+ return -1;
+ break;
+ }
+ break;
+ case 4:
+ case 8:
+ insn->eaBaseBase = (insn->addressSize == 4 ? EA_BASE_EAX : EA_BASE_RAX);
+
+ switch (mod) {
+ case 0x0:
+ insn->eaDisplacement = EA_DISP_NONE; /* readSIB may override this */
+ switch (rm) {
+ case 0x4:
+      case 0xc: /* in case REX.b is set */
+ insn->eaBase = (insn->addressSize == 4 ?
+ EA_BASE_sib : EA_BASE_sib64);
+ readSIB(insn);
+ if (readDisplacement(insn))
+ return -1;
+ break;
+ case 0x5:
+ insn->eaBase = EA_BASE_NONE;
+ insn->eaDisplacement = EA_DISP_32;
+ if (readDisplacement(insn))
+ return -1;
+ break;
+ default:
+ insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+ break;
+ }
+ break;
+ case 0x1:
+ case 0x2:
+ insn->eaDisplacement = (mod == 0x1 ? EA_DISP_8 : EA_DISP_32);
+ switch (rm) {
+ case 0x4:
+      case 0xc: /* in case REX.b is set */
+ insn->eaBase = EA_BASE_sib;
+ readSIB(insn);
+ if (readDisplacement(insn))
+ return -1;
+ break;
+ default:
+ insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+ if (readDisplacement(insn))
+ return -1;
+ break;
+ }
+ break;
+ case 0x3:
+ insn->eaDisplacement = EA_DISP_NONE;
+ insn->eaBase = (EABase)(insn->eaRegBase + rm);
+ break;
+ }
+ break;
+ } /* switch (insn->addressSize) */
+
+ return 0;
+}
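+
+/*
+ * Worked example (32-bit addressing, no REX prefix): a ModR/M byte of 0x54
+ * has mod 0b01, reg 0b010, and rm 0b100, so the reg operand becomes
+ * MODRM_REG_EDX (for a 4-byte register size), rm indicates that a SIB byte
+ * follows, and an 8-bit displacement follows the SIB byte.
+ */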
+
+#define GENERIC_FIXUP_FUNC(name, base, prefix) \
+ static uint8_t name(struct InternalInstruction *insn, \
+ OperandType type, \
+ uint8_t index, \
+ uint8_t *valid) { \
+ *valid = 1; \
+ switch (type) { \
+ default: \
+ debug("Unhandled register type"); \
+ *valid = 0; \
+ return 0; \
+ case TYPE_Rv: \
+ return base + index; \
+ case TYPE_R8: \
+ if (insn->rexPrefix && \
+ index >= 4 && index <= 7) { \
+ return prefix##_SPL + (index - 4); \
+ } else { \
+ return prefix##_AL + index; \
+ } \
+ case TYPE_R16: \
+ return prefix##_AX + index; \
+ case TYPE_R32: \
+ return prefix##_EAX + index; \
+ case TYPE_R64: \
+ return prefix##_RAX + index; \
+ case TYPE_XMM128: \
+ case TYPE_XMM64: \
+ case TYPE_XMM32: \
+ case TYPE_XMM: \
+ return prefix##_XMM0 + index; \
+ case TYPE_MM64: \
+ case TYPE_MM32: \
+ case TYPE_MM: \
+ if (index > 7) \
+ *valid = 0; \
+ return prefix##_MM0 + index; \
+ case TYPE_SEGMENTREG: \
+ if (index > 5) \
+ *valid = 0; \
+ return prefix##_ES + index; \
+ case TYPE_DEBUGREG: \
+ if (index > 7) \
+ *valid = 0; \
+ return prefix##_DR0 + index; \
+ case TYPE_CONTROLREG: \
+ if (index > 8) \
+ *valid = 0; \
+ return prefix##_CR0 + index; \
+ } \
+ }
+
+/*
+ * fixup*Value - Consults an operand type to determine the meaning of the
+ *   reg or R/M field. If the operand is an XMM operand, for example, the
+ *   operand should be XMM0 instead of AX, which is what readModRM() would
+ *   otherwise report.
+ *
+ * @param insn - The instruction containing the operand.
+ * @param type - The operand type.
+ * @param index - The existing value of the field as reported by readModRM().
+ * @param valid - The address of a uint8_t. The target is set to 1 if the
+ * field is valid for the register class; 0 if not.
+ * @return - The proper value.
+ */
+GENERIC_FIXUP_FUNC(fixupRegValue, insn->regBase, MODRM_REG)
+GENERIC_FIXUP_FUNC(fixupRMValue, insn->eaRegBase, EA_REG)
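+
+/*
+ * For illustration: if readModRM() reported MODRM_REG_EDX (index 2 relative
+ * to the register base) but the operand type is TYPE_XMM, fixupRegValue()
+ * remaps it to MODRM_REG_XMM2; the same index with TYPE_SEGMENTREG yields
+ * MODRM_REG_SS.
+ */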
+
+/*
+ * fixupReg - Consults an operand specifier to determine which of the
+ * fixup*Value functions to use in correcting readModRM()'s interpretation.
+ *
+ * @param insn - See fixup*Value().
+ * @param op - The operand specifier.
+ * @return - 0 if fixup was successful; -1 if the register returned was
+ * invalid for its class.
+ */
+static int fixupReg(struct InternalInstruction *insn,
+ struct OperandSpecifier *op) {
+ uint8_t valid;
+
+ dbgprintf(insn, "fixupReg()");
+
+ switch ((OperandEncoding)op->encoding) {
+ default:
+ debug("Expected a REG or R/M encoding in fixupReg");
+ return -1;
+ case ENCODING_REG:
+ insn->reg = (Reg)fixupRegValue(insn,
+ (OperandType)op->type,
+ insn->reg - insn->regBase,
+ &valid);
+ if (!valid)
+ return -1;
+ break;
+ case ENCODING_RM:
+ if (insn->eaBase >= insn->eaRegBase) {
+ insn->eaBase = (EABase)fixupRMValue(insn,
+ (OperandType)op->type,
+ insn->eaBase - insn->eaRegBase,
+ &valid);
+ if (!valid)
+ return -1;
+ }
+ break;
+ }
+
+ return 0;
+}
+
+/*
+ * readOpcodeModifier - Reads an operand from the opcode field of an
+ * instruction. Handles AddRegFrm instructions.
+ *
+ * @param insn - The instruction whose opcode field is to be read.
+ * @return - 0 on success; nonzero otherwise.
+ */
+static int readOpcodeModifier(struct InternalInstruction* insn) {
+ dbgprintf(insn, "readOpcodeModifier()");
+
+ if (insn->consumedOpcodeModifier)
+ return 0;
+
+ insn->consumedOpcodeModifier = TRUE;
+
+ switch (insn->spec->modifierType) {
+ default:
+ debug("Unknown modifier type.");
+ return -1;
+ case MODIFIER_NONE:
+ debug("No modifier but an operand expects one.");
+ return -1;
+ case MODIFIER_OPCODE:
+ insn->opcodeModifier = insn->opcode - insn->spec->modifierBase;
+ return 0;
+ case MODIFIER_MODRM:
+ insn->opcodeModifier = insn->modRM - insn->spec->modifierBase;
+ return 0;
+ }
+}
+
+/*
+ * readOpcodeRegister - Reads an operand from the opcode field of an
+ * instruction and interprets it appropriately given the operand width.
+ * Handles AddRegFrm instructions.
+ *
+ * @param insn - See readOpcodeModifier().
+ * @param size - The width (in bytes) of the register being specified.
+ * 1 means AL and friends, 2 means AX, 4 means EAX, and 8 means
+ * RAX.
+ * @return - 0 on success; nonzero otherwise.
+ */
+static int readOpcodeRegister(struct InternalInstruction* insn, uint8_t size) {
+ dbgprintf(insn, "readOpcodeRegister()");
+
+ if (readOpcodeModifier(insn))
+ return -1;
+
+ if (size == 0)
+ size = insn->registerSize;
+
+ switch (size) {
+ case 1:
+ insn->opcodeRegister = (Reg)(MODRM_REG_AL + ((bFromREX(insn->rexPrefix) << 3)
+ | insn->opcodeModifier));
+ if (insn->rexPrefix &&
+ insn->opcodeRegister >= MODRM_REG_AL + 0x4 &&
+ insn->opcodeRegister < MODRM_REG_AL + 0x8) {
+ insn->opcodeRegister = (Reg)(MODRM_REG_SPL
+ + (insn->opcodeRegister - MODRM_REG_AL - 4));
+ }
+
+ break;
+ case 2:
+ insn->opcodeRegister = (Reg)(MODRM_REG_AX
+ + ((bFromREX(insn->rexPrefix) << 3)
+ | insn->opcodeModifier));
+ break;
+ case 4:
+ insn->opcodeRegister = (Reg)(MODRM_REG_EAX
+ + ((bFromREX(insn->rexPrefix) << 3)
+ | insn->opcodeModifier));
+ break;
+ case 8:
+ insn->opcodeRegister = (Reg)(MODRM_REG_RAX
+ + ((bFromREX(insn->rexPrefix) << 3)
+ | insn->opcodeModifier));
+ break;
+ }
+
+ return 0;
+}
+
+/*
+ * readImmediate - Consumes an immediate operand from an instruction, given the
+ * desired operand size.
+ *
+ * @param insn - The instruction whose operand is to be read.
+ * @param size - The width (in bytes) of the operand.
+ * @return - 0 if the immediate was successfully consumed; nonzero
+ * otherwise.
+ */
+static int readImmediate(struct InternalInstruction* insn, uint8_t size) {
+ uint8_t imm8;
+ uint16_t imm16;
+ uint32_t imm32;
+ uint64_t imm64;
+
+ dbgprintf(insn, "readImmediate()");
+
+ if (insn->numImmediatesConsumed == 2) {
+ debug("Already consumed two immediates");
+ return -1;
+ }
+
+ if (size == 0)
+ size = insn->immediateSize;
+ else
+ insn->immediateSize = size;
+
+ switch (size) {
+ case 1:
+ if (consumeByte(insn, &imm8))
+ return -1;
+ insn->immediates[insn->numImmediatesConsumed] = imm8;
+ break;
+ case 2:
+ if (consumeUInt16(insn, &imm16))
+ return -1;
+ insn->immediates[insn->numImmediatesConsumed] = imm16;
+ break;
+ case 4:
+ if (consumeUInt32(insn, &imm32))
+ return -1;
+ insn->immediates[insn->numImmediatesConsumed] = imm32;
+ break;
+ case 8:
+ if (consumeUInt64(insn, &imm64))
+ return -1;
+ insn->immediates[insn->numImmediatesConsumed] = imm64;
+ break;
+ }
+
+ insn->numImmediatesConsumed++;
+
+ return 0;
+}
+
+/*
+ * readOperands - Consults the specifier for an instruction and consumes all
+ * operands for that instruction, interpreting them as it goes.
+ *
+ * @param insn - The instruction whose operands are to be read and interpreted.
+ * @return - 0 if all operands could be read; nonzero otherwise.
+ */
+static int readOperands(struct InternalInstruction* insn) {
+ int index;
+
+ dbgprintf(insn, "readOperands()");
+
+ for (index = 0; index < X86_MAX_OPERANDS; ++index) {
+ switch (insn->spec->operands[index].encoding) {
+ case ENCODING_NONE:
+ break;
+ case ENCODING_REG:
+ case ENCODING_RM:
+ if (readModRM(insn))
+ return -1;
+ if (fixupReg(insn, &insn->spec->operands[index]))
+ return -1;
+ break;
+ case ENCODING_CB:
+ case ENCODING_CW:
+ case ENCODING_CD:
+ case ENCODING_CP:
+ case ENCODING_CO:
+ case ENCODING_CT:
+ dbgprintf(insn, "We currently don't hande code-offset encodings");
+ return -1;
+ case ENCODING_IB:
+ if (readImmediate(insn, 1))
+ return -1;
+ if (insn->spec->operands[index].type == TYPE_IMM3 &&
+ insn->immediates[insn->numImmediatesConsumed - 1] > 7)
+ return -1;
+ break;
+ case ENCODING_IW:
+ if (readImmediate(insn, 2))
+ return -1;
+ break;
+ case ENCODING_ID:
+ if (readImmediate(insn, 4))
+ return -1;
+ break;
+ case ENCODING_IO:
+ if (readImmediate(insn, 8))
+ return -1;
+ break;
+ case ENCODING_Iv:
+ if (readImmediate(insn, insn->immediateSize))
+ return -1;
+ break;
+ case ENCODING_Ia:
+ if (readImmediate(insn, insn->addressSize))
+ return -1;
+ break;
+ case ENCODING_RB:
+ if (readOpcodeRegister(insn, 1))
+ return -1;
+ break;
+ case ENCODING_RW:
+ if (readOpcodeRegister(insn, 2))
+ return -1;
+ break;
+ case ENCODING_RD:
+ if (readOpcodeRegister(insn, 4))
+ return -1;
+ break;
+ case ENCODING_RO:
+ if (readOpcodeRegister(insn, 8))
+ return -1;
+ break;
+ case ENCODING_Rv:
+ if (readOpcodeRegister(insn, 0))
+ return -1;
+ break;
+ case ENCODING_I:
+      if (readOpcodeModifier(insn))
+        return -1;
+      break;
+ case ENCODING_DUP:
+ break;
+ default:
+ dbgprintf(insn, "Encountered an operand with an unknown encoding.");
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * decodeInstruction - Reads and interprets a full instruction provided by the
+ * user.
+ *
+ * @param insn - A pointer to the instruction to be populated. Must be
+ * pre-allocated.
+ * @param reader - The function to be used to read the instruction's bytes.
+ * @param readerArg - A generic argument to be passed to the reader to store
+ * any internal state.
+ * @param logger - If non-NULL, the function to be used to write log messages
+ * and warnings.
+ * @param loggerArg - A generic argument to be passed to the logger to store
+ * any internal state.
+ * @param startLoc - The address (in the reader's address space) of the first
+ * byte in the instruction.
+ * @param mode - The mode (real mode, IA-32e, or IA-32e in 64-bit mode) to
+ * decode the instruction in.
+ * @return - 0 if the instruction's memory could be read; nonzero if
+ * not.
+ */
+int decodeInstruction(struct InternalInstruction* insn,
+ byteReader_t reader,
+ void* readerArg,
+ dlog_t logger,
+ void* loggerArg,
+ uint64_t startLoc,
+ DisassemblerMode mode) {
+ memset(insn, 0, sizeof(struct InternalInstruction));
+
+ insn->reader = reader;
+ insn->readerArg = readerArg;
+ insn->dlog = logger;
+ insn->dlogArg = loggerArg;
+ insn->startLocation = startLoc;
+ insn->readerCursor = startLoc;
+ insn->mode = mode;
+ insn->numImmediatesConsumed = 0;
+
+ if (readPrefixes(insn) ||
+ readOpcode(insn) ||
+ getID(insn) ||
+ insn->instructionID == 0 ||
+ readOperands(insn))
+ return -1;
+
+ insn->length = insn->readerCursor - insn->startLocation;
+
+ dbgprintf(insn, "Read from 0x%llx to 0x%llx: length %zu",
+ startLoc, insn->readerCursor, insn->length);
+
+ if (insn->length > 15)
+ dbgprintf(insn, "Instruction exceeds 15-byte limit");
+
+ return 0;
+}
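+
+/*
+ * Minimal usage sketch (illustrative only; the reader callback and the
+ * 'bytes' buffer below are hypothetical, not part of this file):
+ *
+ *   static int bufferReader(void* arg, uint8_t* byte, uint64_t address) {
+ *     *byte = ((const uint8_t*)arg)[address];
+ *     return 0;
+ *   }
+ *
+ *   struct InternalInstruction insn;
+ *   if (!decodeInstruction(&insn, bufferReader, (void*)bytes, NULL, NULL,
+ *                          0, MODE_32BIT))
+ *     printf("decoded %zu byte(s)\n", insn.length);
+ */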
diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
new file mode 100644
index 0000000..28ba86b
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
@@ -0,0 +1,515 @@
+/*===- X86DisassemblerDecoder.h - Disassembler decoder -------------*- C -*-==*
+ *
+ * The LLVM Compiler Infrastructure
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *
+ *===----------------------------------------------------------------------===*
+ *
+ * This file is part of the X86 Disassembler.
+ * It contains the public interface of the instruction decoder.
+ * Documentation for the disassembler can be found in X86Disassembler.h.
+ *
+ *===----------------------------------------------------------------------===*/
+
+#ifndef X86DISASSEMBLERDECODER_H
+#define X86DISASSEMBLERDECODER_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define INSTRUCTION_SPECIFIER_FIELDS \
+ const char* name;
+
+#define INSTRUCTION_IDS \
+ InstrUID* instructionIDs;
+
+#include "X86DisassemblerDecoderCommon.h"
+
+#undef INSTRUCTION_SPECIFIER_FIELDS
+#undef INSTRUCTION_IDS
+
+/*
+ * Accessor functions for various fields of an Intel instruction
+ */
+#define modFromModRM(modRM) ((modRM & 0xc0) >> 6)
+#define regFromModRM(modRM) ((modRM & 0x38) >> 3)
+#define rmFromModRM(modRM) (modRM & 0x7)
+#define scaleFromSIB(sib) ((sib & 0xc0) >> 6)
+#define indexFromSIB(sib) ((sib & 0x38) >> 3)
+#define baseFromSIB(sib) (sib & 0x7)
+#define wFromREX(rex) ((rex & 0x8) >> 3)
+#define rFromREX(rex) ((rex & 0x4) >> 2)
+#define xFromREX(rex) ((rex & 0x2) >> 1)
+#define bFromREX(rex) (rex & 0x1)
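+
+/*
+ * For illustration: for the ModR/M byte 0xc3, modFromModRM gives 0x3,
+ * regFromModRM gives 0x0, and rmFromModRM gives 0x3; for the REX prefix
+ * 0x48, wFromREX gives 1 while the r, x, and b bits are all 0.
+ */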
+
+/*
+ * These enums represent Intel registers for use by the decoder.
+ */
+
+#define REGS_8BIT \
+ ENTRY(AL) \
+ ENTRY(CL) \
+ ENTRY(DL) \
+ ENTRY(BL) \
+ ENTRY(AH) \
+ ENTRY(CH) \
+ ENTRY(DH) \
+ ENTRY(BH) \
+ ENTRY(R8B) \
+ ENTRY(R9B) \
+ ENTRY(R10B) \
+ ENTRY(R11B) \
+ ENTRY(R12B) \
+ ENTRY(R13B) \
+ ENTRY(R14B) \
+ ENTRY(R15B) \
+ ENTRY(SPL) \
+ ENTRY(BPL) \
+ ENTRY(SIL) \
+ ENTRY(DIL)
+
+#define EA_BASES_16BIT \
+ ENTRY(BX_SI) \
+ ENTRY(BX_DI) \
+ ENTRY(BP_SI) \
+ ENTRY(BP_DI) \
+ ENTRY(SI) \
+ ENTRY(DI) \
+ ENTRY(BP) \
+ ENTRY(BX) \
+ ENTRY(R8W) \
+ ENTRY(R9W) \
+ ENTRY(R10W) \
+ ENTRY(R11W) \
+ ENTRY(R12W) \
+ ENTRY(R13W) \
+ ENTRY(R14W) \
+ ENTRY(R15W)
+
+#define REGS_16BIT \
+ ENTRY(AX) \
+ ENTRY(CX) \
+ ENTRY(DX) \
+ ENTRY(BX) \
+ ENTRY(SP) \
+ ENTRY(BP) \
+ ENTRY(SI) \
+ ENTRY(DI) \
+ ENTRY(R8W) \
+ ENTRY(R9W) \
+ ENTRY(R10W) \
+ ENTRY(R11W) \
+ ENTRY(R12W) \
+ ENTRY(R13W) \
+ ENTRY(R14W) \
+ ENTRY(R15W)
+
+#define EA_BASES_32BIT \
+ ENTRY(EAX) \
+ ENTRY(ECX) \
+ ENTRY(EDX) \
+ ENTRY(EBX) \
+ ENTRY(sib) \
+ ENTRY(EBP) \
+ ENTRY(ESI) \
+ ENTRY(EDI) \
+ ENTRY(R8D) \
+ ENTRY(R9D) \
+ ENTRY(R10D) \
+ ENTRY(R11D) \
+ ENTRY(R12D) \
+ ENTRY(R13D) \
+ ENTRY(R14D) \
+ ENTRY(R15D)
+
+#define REGS_32BIT \
+ ENTRY(EAX) \
+ ENTRY(ECX) \
+ ENTRY(EDX) \
+ ENTRY(EBX) \
+ ENTRY(ESP) \
+ ENTRY(EBP) \
+ ENTRY(ESI) \
+ ENTRY(EDI) \
+ ENTRY(R8D) \
+ ENTRY(R9D) \
+ ENTRY(R10D) \
+ ENTRY(R11D) \
+ ENTRY(R12D) \
+ ENTRY(R13D) \
+ ENTRY(R14D) \
+ ENTRY(R15D)
+
+#define EA_BASES_64BIT \
+ ENTRY(RAX) \
+ ENTRY(RCX) \
+ ENTRY(RDX) \
+ ENTRY(RBX) \
+ ENTRY(sib64) \
+ ENTRY(RBP) \
+ ENTRY(RSI) \
+ ENTRY(RDI) \
+ ENTRY(R8) \
+ ENTRY(R9) \
+ ENTRY(R10) \
+ ENTRY(R11) \
+ ENTRY(R12) \
+ ENTRY(R13) \
+ ENTRY(R14) \
+ ENTRY(R15)
+
+#define REGS_64BIT \
+ ENTRY(RAX) \
+ ENTRY(RCX) \
+ ENTRY(RDX) \
+ ENTRY(RBX) \
+ ENTRY(RSP) \
+ ENTRY(RBP) \
+ ENTRY(RSI) \
+ ENTRY(RDI) \
+ ENTRY(R8) \
+ ENTRY(R9) \
+ ENTRY(R10) \
+ ENTRY(R11) \
+ ENTRY(R12) \
+ ENTRY(R13) \
+ ENTRY(R14) \
+ ENTRY(R15)
+
+#define REGS_MMX \
+ ENTRY(MM0) \
+ ENTRY(MM1) \
+ ENTRY(MM2) \
+ ENTRY(MM3) \
+ ENTRY(MM4) \
+ ENTRY(MM5) \
+ ENTRY(MM6) \
+ ENTRY(MM7)
+
+#define REGS_XMM \
+ ENTRY(XMM0) \
+ ENTRY(XMM1) \
+ ENTRY(XMM2) \
+ ENTRY(XMM3) \
+ ENTRY(XMM4) \
+ ENTRY(XMM5) \
+ ENTRY(XMM6) \
+ ENTRY(XMM7) \
+ ENTRY(XMM8) \
+ ENTRY(XMM9) \
+ ENTRY(XMM10) \
+ ENTRY(XMM11) \
+ ENTRY(XMM12) \
+ ENTRY(XMM13) \
+ ENTRY(XMM14) \
+ ENTRY(XMM15)
+
+#define REGS_SEGMENT \
+ ENTRY(ES) \
+ ENTRY(CS) \
+ ENTRY(SS) \
+ ENTRY(DS) \
+ ENTRY(FS) \
+ ENTRY(GS)
+
+#define REGS_DEBUG \
+ ENTRY(DR0) \
+ ENTRY(DR1) \
+ ENTRY(DR2) \
+ ENTRY(DR3) \
+ ENTRY(DR4) \
+ ENTRY(DR5) \
+ ENTRY(DR6) \
+ ENTRY(DR7)
+
+#define REGS_CONTROL \
+ ENTRY(CR0) \
+ ENTRY(CR1) \
+ ENTRY(CR2) \
+ ENTRY(CR3) \
+ ENTRY(CR4) \
+ ENTRY(CR5) \
+ ENTRY(CR6) \
+ ENTRY(CR7) \
+ ENTRY(CR8)
+
+#define ALL_EA_BASES \
+ EA_BASES_16BIT \
+ EA_BASES_32BIT \
+ EA_BASES_64BIT
+
+#define ALL_SIB_BASES \
+ REGS_32BIT \
+ REGS_64BIT
+
+#define ALL_REGS \
+ REGS_8BIT \
+ REGS_16BIT \
+ REGS_32BIT \
+ REGS_64BIT \
+ REGS_MMX \
+ REGS_XMM \
+ REGS_SEGMENT \
+ REGS_DEBUG \
+ REGS_CONTROL \
+ ENTRY(RIP)
+
+/*
+ * EABase - All possible values of the base field for effective-address
+ * computations, a.k.a. the Mod and R/M fields of the ModR/M byte. We
+ * distinguish between bases (EA_BASE_*) and registers that just happen to be
+ * referred to when Mod == 0b11 (EA_REG_*).
+ */
+typedef enum {
+ EA_BASE_NONE,
+#define ENTRY(x) EA_BASE_##x,
+ ALL_EA_BASES
+#undef ENTRY
+#define ENTRY(x) EA_REG_##x,
+ ALL_REGS
+#undef ENTRY
+ EA_max
+} EABase;
+
+/*
+ * SIBIndex - All possible values of the SIB index field.
+ * Borrows entries from ALL_EA_BASES with the special case that
+ * sib is synonymous with NONE.
+ */
+typedef enum {
+ SIB_INDEX_NONE,
+#define ENTRY(x) SIB_INDEX_##x,
+ ALL_EA_BASES
+#undef ENTRY
+ SIB_INDEX_max
+} SIBIndex;
+
+/*
+ * SIBBase - All possible values of the SIB base field.
+ */
+typedef enum {
+ SIB_BASE_NONE,
+#define ENTRY(x) SIB_BASE_##x,
+ ALL_SIB_BASES
+#undef ENTRY
+ SIB_BASE_max
+} SIBBase;
+
+/*
+ * EADisplacement - Possible displacement types for effective-address
+ * computations.
+ */
+typedef enum {
+ EA_DISP_NONE,
+ EA_DISP_8,
+ EA_DISP_16,
+ EA_DISP_32
+} EADisplacement;
+
+/*
+ * Reg - All possible values of the reg field in the ModR/M byte.
+ */
+typedef enum {
+#define ENTRY(x) MODRM_REG_##x,
+ ALL_REGS
+#undef ENTRY
+ MODRM_REG_max
+} Reg;
+
+/*
+ * SegmentOverride - All possible segment overrides.
+ */
+typedef enum {
+ SEG_OVERRIDE_NONE,
+ SEG_OVERRIDE_CS,
+ SEG_OVERRIDE_SS,
+ SEG_OVERRIDE_DS,
+ SEG_OVERRIDE_ES,
+ SEG_OVERRIDE_FS,
+ SEG_OVERRIDE_GS,
+ SEG_OVERRIDE_max
+} SegmentOverride;
+
+typedef uint8_t BOOL;
+
+/*
+ * byteReader_t - Type for the byte reader that the consumer must provide to
+ * the decoder. Reads a single byte from the instruction's address space.
+ * @param arg - A baton that the consumer can associate with any internal
+ * state that it needs.
+ * @param byte - A pointer to a single byte in memory that should be set to
+ * contain the value at address.
+ * @param address - The address in the instruction's address space that should
+ * be read from.
+ * @return - -1 if the byte cannot be read for any reason; 0 otherwise.
+ */
+typedef int (*byteReader_t)(void* arg, uint8_t* byte, uint64_t address);
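+
+/*
+ * Illustrative sketch only, not part of this header's interface: one way a
+ * consumer might implement byteReader_t over an in-memory buffer.  The struct
+ * and function names below are invented for this example.
+ */
+struct exampleBufferRegion {
+  const uint8_t* bytes;     /* start of the code bytes */
+  uint64_t size;            /* number of valid bytes */
+};
+
+static int exampleBufferByteReader(void* arg, uint8_t* byte, uint64_t address) {
+  const struct exampleBufferRegion* region =
+    (const struct exampleBufferRegion*)arg;
+  if (address >= region->size)
+    return -1;              /* out of range; reported as a read failure */
+  *byte = region->bytes[address];
+  return 0;
+}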
+
+/*
+ * dlog_t - Type for the logging function that the consumer can provide to
+ * get debugging output from the decoder.
+ * @param arg - A baton that the consumer can associate with any internal
+ * state that it needs.
+ * @param log - A string that contains the message. Will be reused after
+ * the logger returns.
+ */
+typedef void (*dlog_t)(void* arg, const char *log);
+
+/*
+ * The x86 internal instruction, which is produced by the decoder.
+ */
+struct InternalInstruction {
+ /* Reader interface (C) */
+ byteReader_t reader;
+ /* Opaque value passed to the reader */
+ void* readerArg;
+ /* The address of the next byte to read via the reader */
+ uint64_t readerCursor;
+
+ /* Logger interface (C) */
+ dlog_t dlog;
+ /* Opaque value passed to the logger */
+ void* dlogArg;
+
+ /* General instruction information */
+
+ /* The mode to disassemble for (64-bit, protected, real) */
+ DisassemblerMode mode;
+ /* The start of the instruction, usable with the reader */
+ uint64_t startLocation;
+ /* The length of the instruction, in bytes */
+ size_t length;
+
+ /* Prefix state */
+
+ /* 1 if the prefix byte corresponding to the entry is present; 0 if not */
+ uint8_t prefixPresent[0x100];
+ /* contains the location (for use with the reader) of the prefix byte */
+ uint64_t prefixLocations[0x100];
+ /* The value of the REX prefix, if present */
+ uint8_t rexPrefix;
+ /* The location of the REX prefix */
+ uint64_t rexLocation;
+ /* The location where a mandatory prefix would have to be (i.e., right before
+ the opcode, or right before the REX prefix if one is present) */
+ uint64_t necessaryPrefixLocation;
+ /* The segment override type */
+ SegmentOverride segmentOverride;
+
+ /* Sizes of various critical pieces of data */
+ uint8_t registerSize;
+ uint8_t addressSize;
+ uint8_t displacementSize;
+ uint8_t immediateSize;
+
+ /* opcode state */
+
+ /* The value of the two-byte escape prefix (usually 0x0f) */
+ uint8_t twoByteEscape;
+ /* The value of the three-byte escape prefix (usually 0x38 or 0x3a) */
+ uint8_t threeByteEscape;
+ /* The last byte of the opcode, not counting any ModR/M extension */
+ uint8_t opcode;
+ /* The ModR/M byte of the instruction, if it is an opcode extension */
+ uint8_t modRMExtension;
+
+ /* decode state */
+
+ /* The type of opcode, used for indexing into the array of decode tables */
+ OpcodeType opcodeType;
+ /* The instruction ID, extracted from the decode table */
+ uint16_t instructionID;
+ /* The specifier for the instruction, from the instruction info table */
+ struct InstructionSpecifier* spec;
+
+ /* state for additional bytes, consumed during operand decode. Pattern:
+ consumed___ indicates that the byte was already consumed and does not
+ need to be consumed again */
+
+ /* The ModR/M byte, which contains most register operands and some portion of
+ all memory operands */
+ BOOL consumedModRM;
+ uint8_t modRM;
+
+ /* The SIB byte, used for more complex 32- or 64-bit memory operands */
+ BOOL consumedSIB;
+ uint8_t sib;
+
+ /* The displacement, used for memory operands */
+ BOOL consumedDisplacement;
+ int32_t displacement;
+
+ /* Immediates. There can be two in some cases */
+ uint8_t numImmediatesConsumed;
+ uint8_t numImmediatesTranslated;
+ uint64_t immediates[2];
+
+ /* A register or immediate operand encoded into the opcode */
+ BOOL consumedOpcodeModifier;
+ uint8_t opcodeModifier;
+ Reg opcodeRegister;
+
+ /* Portions of the ModR/M byte */
+
+ /* These fields determine the allowable values for the ModR/M fields, which
+ depend on operand and address widths */
+ EABase eaBaseBase;
+ EABase eaRegBase;
+ Reg regBase;
+
+ /* The Mod and R/M fields can encode a base for an effective address, or a
+ register. These are separated into two fields here */
+ EABase eaBase;
+ EADisplacement eaDisplacement;
+ /* The reg field always encodes a register */
+ Reg reg;
+
+ /* SIB state */
+ SIBIndex sibIndex;
+ uint8_t sibScale;
+ SIBBase sibBase;
+};
+
+/* decodeInstruction - Decode one instruction and store the decoding results in
+ * a buffer provided by the consumer.
+ * @param insn - The buffer to store the instruction in. Allocated by the
+ * consumer.
+ * @param reader - The byteReader_t for the bytes to be read.
+ * @param readerArg - An argument to pass to the reader for storing context
+ * specific to the consumer. May be NULL.
+ * @param logger - The dlog_t to be used in printing status messages from the
+ * disassembler. May be NULL.
+ * @param loggerArg - An argument to pass to the logger for storing context
+ * specific to the logger. May be NULL.
+ * @param startLoc - The address (in the reader's address space) of the first
+ * byte in the instruction.
+ * @param mode - The mode (16-bit, 32-bit, 64-bit) to decode in.
+ * @return - Nonzero if there was an error during decode, 0 otherwise.
+ */
+int decodeInstruction(struct InternalInstruction* insn,
+ byteReader_t reader,
+ void* readerArg,
+ dlog_t logger,
+ void* loggerArg,
+ uint64_t startLoc,
+ DisassemblerMode mode);
+
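+/*
+ * Illustrative sketch only, not part of this header's interface: a minimal
+ * call sequence using the exampleBufferByteReader sketched above and no
+ * logger.  Returns the decoded length, or 0 on a decode error.
+ */
+static size_t exampleDecodeOne(const uint8_t* bytes, uint64_t size,
+                               DisassemblerMode mode) {
+  struct exampleBufferRegion region;
+  struct InternalInstruction insn;
+  region.bytes = bytes;
+  region.size = size;
+  if (decodeInstruction(&insn, exampleBufferByteReader, &region,
+                        /*logger*/ 0, /*loggerArg*/ 0,
+                        /*startLoc*/ 0, mode))
+    return 0;               /* nonzero return indicates a decode error */
+  return insn.length;
+}
+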
+/* x86DisassemblerDebug - C-accessible function for printing a message to
+ * debugs()
+ * @param file - The name of the file printing the debug message.
+ * @param line - The line number that printed the debug message.
+ * @param s - The message to print.
+ */
+
+void x86DisassemblerDebug(const char *file,
+ unsigned line,
+ const char *s);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
new file mode 100644
index 0000000..0f33f52
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
@@ -0,0 +1,355 @@
+/*===- X86DisassemblerDecoderCommon.h - Disassembler decoder -------*- C -*-==*
+ *
+ * The LLVM Compiler Infrastructure
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *
+ *===----------------------------------------------------------------------===*
+ *
+ * This file is part of the X86 Disassembler.
+ * It contains common definitions used by both the disassembler and the table
+ * generator.
+ * Documentation for the disassembler can be found in X86Disassembler.h.
+ *
+ *===----------------------------------------------------------------------===*/
+
+/*
+ * This header file provides those definitions that need to be shared between
+ * the decoder and the table generator in a C-friendly manner.
+ */
+
+#ifndef X86DISASSEMBLERDECODERCOMMON_H
+#define X86DISASSEMBLERDECODERCOMMON_H
+
+#include "llvm/System/DataTypes.h"
+
+#define INSTRUCTIONS_SYM x86DisassemblerInstrSpecifiers
+#define CONTEXTS_SYM x86DisassemblerContexts
+#define ONEBYTE_SYM x86DisassemblerOneByteOpcodes
+#define TWOBYTE_SYM x86DisassemblerTwoByteOpcodes
+#define THREEBYTE38_SYM x86DisassemblerThreeByte38Opcodes
+#define THREEBYTE3A_SYM x86DisassemblerThreeByte3AOpcodes
+
+#define INSTRUCTIONS_STR "x86DisassemblerInstrSpecifiers"
+#define CONTEXTS_STR "x86DisassemblerContexts"
+#define ONEBYTE_STR "x86DisassemblerOneByteOpcodes"
+#define TWOBYTE_STR "x86DisassemblerTwoByteOpcodes"
+#define THREEBYTE38_STR "x86DisassemblerThreeByte38Opcodes"
+#define THREEBYTE3A_STR "x86DisassemblerThreeByte3AOpcodes"
+
+/*
+ * Attributes of an instruction that must be known before the opcode can be
+ * processed correctly. Most of these indicate the presence of particular
+ * prefixes, but ATTR_64BIT is simply an attribute of the decoding context.
+ */
+#define ATTRIBUTE_BITS \
+ ENUM_ENTRY(ATTR_NONE, 0x00) \
+ ENUM_ENTRY(ATTR_64BIT, 0x01) \
+ ENUM_ENTRY(ATTR_XS, 0x02) \
+ ENUM_ENTRY(ATTR_XD, 0x04) \
+ ENUM_ENTRY(ATTR_REXW, 0x08) \
+ ENUM_ENTRY(ATTR_OPSIZE, 0x10)
+
+#define ENUM_ENTRY(n, v) n = v,
+enum attributeBits {
+ ATTRIBUTE_BITS
+ ATTR_max
+};
+#undef ENUM_ENTRY
+
+/*
+ * Combinations of the above attributes that are relevant to instruction
+ * decode. Although other combinations are possible, they can be reduced to
+ * these without affecting the ultimately decoded instruction.
+ */
+
+/* Class name Rank Rationale for rank assignment */
+#define INSTRUCTION_CONTEXTS \
+ ENUM_ENTRY(IC, 0, "says nothing about the instruction") \
+ ENUM_ENTRY(IC_64BIT, 1, "says the instruction applies in " \
+ "64-bit mode but no more") \
+ ENUM_ENTRY(IC_OPSIZE, 3, "requires an OPSIZE prefix, so " \
+ "operands change width") \
+ ENUM_ENTRY(IC_XD, 2, "may say something about the opcode " \
+ "but not the operands") \
+ ENUM_ENTRY(IC_XS, 2, "may say something about the opcode " \
+ "but not the operands") \
+ ENUM_ENTRY(IC_64BIT_REXW, 4, "requires a REX.W prefix, so operands "\
+ "change width; overrides IC_OPSIZE") \
+ ENUM_ENTRY(IC_64BIT_OPSIZE, 3, "Just as meaningful as IC_OPSIZE") \
+ ENUM_ENTRY(IC_64BIT_XD, 5, "XD instructions are SSE; REX.W is " \
+ "secondary") \
+ ENUM_ENTRY(IC_64BIT_XS, 5, "Just as meaningful as IC_64BIT_XD") \
+ ENUM_ENTRY(IC_64BIT_REXW_XS, 6, "OPSIZE could mean a different " \
+ "opcode") \
+ ENUM_ENTRY(IC_64BIT_REXW_XD, 6, "Just as meaningful as " \
+ "IC_64BIT_REXW_XS") \
+ ENUM_ENTRY(IC_64BIT_REXW_OPSIZE, 7, "The Dynamic Duo! Prefer over all " \
+ "else because this changes most " \
+ "operands' meaning")
+
+#define ENUM_ENTRY(n, r, d) n,
+typedef enum {
+ INSTRUCTION_CONTEXTS
+ IC_max
+} InstructionContext;
+#undef ENUM_ENTRY
+
+/*
+ * Opcode types, which determine which decode table to use, both in the Intel
+ * manual and also for the decoder.
+ */
+typedef enum {
+ ONEBYTE = 0,
+ TWOBYTE = 1,
+ THREEBYTE_38 = 2,
+ THREEBYTE_3A = 3
+} OpcodeType;
+
+/*
+ * The following structs are used for the hierarchical decode table. After
+ * determining the instruction's class (i.e., which IC_* constant applies to
+ * it), the decoder reads the opcode. Some instructions require specific
+ * values of the ModR/M byte, so the ModR/M byte indexes into the final table.
+ *
+ * If a ModR/M byte is not required, "required" is left unset, and the values
+ * for each instructionID are identical.
+ */
+
+typedef uint16_t InstrUID;
+
+/*
+ * ModRMDecisionType - describes the type of ModR/M decision, allowing the
+ * consumer to determine the number of entries in it.
+ *
+ * MODRM_ONEENTRY - No matter what the value of the ModR/M byte is, the decoded
+ * instruction is the same.
+ * MODRM_SPLITRM - If the ModR/M byte is between 0x00 and 0xbf, the opcode
+ * corresponds to one instruction; otherwise, it corresponds to
+ * a different instruction.
+ * MODRM_FULL - Potentially, each value of the ModR/M byte could correspond
+ * to a different instruction.
+ */
+
+#define MODRMTYPES \
+ ENUM_ENTRY(MODRM_ONEENTRY) \
+ ENUM_ENTRY(MODRM_SPLITRM) \
+ ENUM_ENTRY(MODRM_FULL)
+
+#define ENUM_ENTRY(n) n,
+typedef enum {
+ MODRMTYPES
+ MODRM_max
+} ModRMDecisionType;
+#undef ENUM_ENTRY
+
+/*
+ * ModRMDecision - Specifies whether a ModR/M byte is needed and (if so) which
+ * instruction each possible value of the ModR/M byte corresponds to. Once
+ * this information is known, we have narrowed down to a single instruction.
+ */
+struct ModRMDecision {
+ uint8_t modrm_type;
+
+ /* The macro below must be defined wherever this file is included. */
+ INSTRUCTION_IDS
+};
+
+/*
+ * OpcodeDecision - Specifies which set of ModR/M->instruction tables to look at
+ * given a particular opcode.
+ */
+struct OpcodeDecision {
+ struct ModRMDecision modRMDecisions[256];
+};
+
+/*
+ * ContextDecision - Specifies which opcode->instruction tables to look at given
+ * a particular context (set of attributes). Since there are many possible
+ * contexts, the decoder first uses CONTEXTS_SYM to determine which context
+ * applies given a specific set of attributes. Hence there are only IC_max
+ * entries in this table, rather than 2^(ATTR_max).
+ */
+struct ContextDecision {
+ struct OpcodeDecision opcodeDecisions[IC_max];
+};
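+
+/*
+ * Illustrative sketch only: given the tables above, resolving an instruction
+ * is roughly a three-level walk (shown for the one-byte opcode table).  This
+ * assumes CONTEXTS_SYM maps an attribute mask to an InstructionContext and
+ * that INSTRUCTION_IDS expands to an instructionIDs array, as in the decoder:
+ *
+ *   InstrUID uid;
+ *   InstructionContext ctx = CONTEXTS_SYM[attributes];
+ *   const struct ModRMDecision* d =
+ *     &ONEBYTE_SYM.opcodeDecisions[ctx].modRMDecisions[opcode];
+ *   switch (d->modrm_type) {
+ *   case MODRM_ONEENTRY: uid = d->instructionIDs[0];                     break;
+ *   case MODRM_SPLITRM:  uid = d->instructionIDs[modRM >= 0xc0 ? 1 : 0]; break;
+ *   case MODRM_FULL:     uid = d->instructionIDs[modRM];                 break;
+ *   }
+ */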
+
+/*
+ * Physical encodings of instruction operands.
+ */
+
+#define ENCODINGS \
+ ENUM_ENTRY(ENCODING_NONE, "") \
+ ENUM_ENTRY(ENCODING_REG, "Register operand in ModR/M byte.") \
+ ENUM_ENTRY(ENCODING_RM, "R/M operand in ModR/M byte.") \
+ ENUM_ENTRY(ENCODING_CB, "1-byte code offset (possible new CS value)") \
+ ENUM_ENTRY(ENCODING_CW, "2-byte") \
+ ENUM_ENTRY(ENCODING_CD, "4-byte") \
+ ENUM_ENTRY(ENCODING_CP, "6-byte") \
+ ENUM_ENTRY(ENCODING_CO, "8-byte") \
+ ENUM_ENTRY(ENCODING_CT, "10-byte") \
+ ENUM_ENTRY(ENCODING_IB, "1-byte immediate") \
+ ENUM_ENTRY(ENCODING_IW, "2-byte") \
+ ENUM_ENTRY(ENCODING_ID, "4-byte") \
+ ENUM_ENTRY(ENCODING_IO, "8-byte") \
+ ENUM_ENTRY(ENCODING_RB, "(AL..DIL, R8L..R15L) Register code added to " \
+ "the opcode byte") \
+ ENUM_ENTRY(ENCODING_RW, "(AX..DI, R8W..R15W)") \
+ ENUM_ENTRY(ENCODING_RD, "(EAX..EDI, R8D..R15D)") \
+ ENUM_ENTRY(ENCODING_RO, "(RAX..RDI, R8..R15)") \
+ ENUM_ENTRY(ENCODING_I, "Position on floating-point stack added to the " \
+ "opcode byte") \
+ \
+ ENUM_ENTRY(ENCODING_Iv, "Immediate of operand size") \
+ ENUM_ENTRY(ENCODING_Ia, "Immediate of address size") \
+ ENUM_ENTRY(ENCODING_Rv, "Register code of operand size added to the " \
+ "opcode byte") \
+ ENUM_ENTRY(ENCODING_DUP, "Duplicate of another operand; ID is encoded " \
+ "in type")
+
+#define ENUM_ENTRY(n, d) n,
+ typedef enum {
+ ENCODINGS
+ ENCODING_max
+ } OperandEncoding;
+#undef ENUM_ENTRY
+
+/*
+ * Semantic interpretations of instruction operands.
+ */
+
+#define TYPES \
+ ENUM_ENTRY(TYPE_NONE, "") \
+ ENUM_ENTRY(TYPE_REL8, "1-byte immediate address") \
+ ENUM_ENTRY(TYPE_REL16, "2-byte") \
+ ENUM_ENTRY(TYPE_REL32, "4-byte") \
+ ENUM_ENTRY(TYPE_REL64, "8-byte") \
+ ENUM_ENTRY(TYPE_PTR1616, "2+2-byte segment+offset address") \
+ ENUM_ENTRY(TYPE_PTR1632, "2+4-byte") \
+ ENUM_ENTRY(TYPE_PTR1664, "2+8-byte") \
+ ENUM_ENTRY(TYPE_R8, "1-byte register operand") \
+ ENUM_ENTRY(TYPE_R16, "2-byte") \
+ ENUM_ENTRY(TYPE_R32, "4-byte") \
+ ENUM_ENTRY(TYPE_R64, "8-byte") \
+ ENUM_ENTRY(TYPE_IMM8, "1-byte immediate operand") \
+ ENUM_ENTRY(TYPE_IMM16, "2-byte") \
+ ENUM_ENTRY(TYPE_IMM32, "4-byte") \
+ ENUM_ENTRY(TYPE_IMM64, "8-byte") \
+ ENUM_ENTRY(TYPE_IMM3, "1-byte immediate operand between 0 and 7") \
+ ENUM_ENTRY(TYPE_RM8, "1-byte register or memory operand") \
+ ENUM_ENTRY(TYPE_RM16, "2-byte") \
+ ENUM_ENTRY(TYPE_RM32, "4-byte") \
+ ENUM_ENTRY(TYPE_RM64, "8-byte") \
+ ENUM_ENTRY(TYPE_M, "Memory operand") \
+ ENUM_ENTRY(TYPE_M8, "1-byte") \
+ ENUM_ENTRY(TYPE_M16, "2-byte") \
+ ENUM_ENTRY(TYPE_M32, "4-byte") \
+ ENUM_ENTRY(TYPE_M64, "8-byte") \
+ ENUM_ENTRY(TYPE_LEA, "Effective address") \
+ ENUM_ENTRY(TYPE_M128, "16-byte (SSE/SSE2)") \
+ ENUM_ENTRY(TYPE_M1616, "2+2-byte segment+offset address") \
+ ENUM_ENTRY(TYPE_M1632, "2+4-byte") \
+ ENUM_ENTRY(TYPE_M1664, "2+8-byte") \
+ ENUM_ENTRY(TYPE_M16_32, "2+4-byte two-part memory operand (LIDT, LGDT)") \
+ ENUM_ENTRY(TYPE_M16_16, "2+2-byte (BOUND)") \
+ ENUM_ENTRY(TYPE_M32_32, "4+4-byte (BOUND)") \
+ ENUM_ENTRY(TYPE_M16_64, "2+8-byte (LIDT, LGDT)") \
+ ENUM_ENTRY(TYPE_MOFFS8, "1-byte memory offset (relative to segment " \
+ "base)") \
+ ENUM_ENTRY(TYPE_MOFFS16, "2-byte") \
+ ENUM_ENTRY(TYPE_MOFFS32, "4-byte") \
+ ENUM_ENTRY(TYPE_MOFFS64, "8-byte") \
+ ENUM_ENTRY(TYPE_SREG, "Byte with single bit set: 0 = ES, 1 = CS, " \
+ "2 = SS, 3 = DS, 4 = FS, 5 = GS") \
+  ENUM_ENTRY(TYPE_M32FP,      "32-bit IEEE754 memory floating-point operand")  \
+ ENUM_ENTRY(TYPE_M64FP, "64-bit") \
+ ENUM_ENTRY(TYPE_M80FP, "80-bit extended") \
+ ENUM_ENTRY(TYPE_M16INT, "2-byte memory integer operand for use in " \
+ "floating-point instructions") \
+ ENUM_ENTRY(TYPE_M32INT, "4-byte") \
+ ENUM_ENTRY(TYPE_M64INT, "8-byte") \
+ ENUM_ENTRY(TYPE_ST, "Position on the floating-point stack") \
+ ENUM_ENTRY(TYPE_MM, "MMX register operand") \
+ ENUM_ENTRY(TYPE_MM32, "4-byte MMX register or memory operand") \
+ ENUM_ENTRY(TYPE_MM64, "8-byte") \
+ ENUM_ENTRY(TYPE_XMM, "XMM register operand") \
+ ENUM_ENTRY(TYPE_XMM32, "4-byte XMM register or memory operand") \
+ ENUM_ENTRY(TYPE_XMM64, "8-byte") \
+ ENUM_ENTRY(TYPE_XMM128, "16-byte") \
+ ENUM_ENTRY(TYPE_XMM0, "Implicit use of XMM0") \
+ ENUM_ENTRY(TYPE_SEGMENTREG, "Segment register operand") \
+ ENUM_ENTRY(TYPE_DEBUGREG, "Debug register operand") \
+ ENUM_ENTRY(TYPE_CONTROLREG, "Control register operand") \
+ \
+ ENUM_ENTRY(TYPE_Mv, "Memory operand of operand size") \
+ ENUM_ENTRY(TYPE_Rv, "Register operand of operand size") \
+ ENUM_ENTRY(TYPE_IMMv, "Immediate operand of operand size") \
+ ENUM_ENTRY(TYPE_RELv, "Immediate address of operand size") \
+ ENUM_ENTRY(TYPE_DUP0, "Duplicate of operand 0") \
+ ENUM_ENTRY(TYPE_DUP1, "operand 1") \
+ ENUM_ENTRY(TYPE_DUP2, "operand 2") \
+ ENUM_ENTRY(TYPE_DUP3, "operand 3") \
+ ENUM_ENTRY(TYPE_DUP4, "operand 4") \
+ ENUM_ENTRY(TYPE_M512, "512-bit FPU/MMX/XMM/MXCSR state")
+
+#define ENUM_ENTRY(n, d) n,
+typedef enum {
+ TYPES
+ TYPE_max
+} OperandType;
+#undef ENUM_ENTRY
+
+/*
+ * OperandSpecifier - The specification for how to extract and interpret one
+ * operand.
+ */
+struct OperandSpecifier {
+ OperandEncoding encoding;
+ OperandType type;
+};
+
+/*
+ * Indicates where the opcode modifier (if any) is to be found. Extended
+ * opcodes with AddRegFrm have the opcode modifier in the ModR/M byte.
+ */
+
+#define MODIFIER_TYPES \
+ ENUM_ENTRY(MODIFIER_NONE) \
+ ENUM_ENTRY(MODIFIER_OPCODE) \
+ ENUM_ENTRY(MODIFIER_MODRM)
+
+#define ENUM_ENTRY(n) n,
+typedef enum {
+ MODIFIER_TYPES
+ MODIFIER_max
+} ModifierType;
+#undef ENUM_ENTRY
+
+#define X86_MAX_OPERANDS 5
+
+/*
+ * The specification for how to extract and interpret a full instruction and
+ * its operands.
+ */
+struct InstructionSpecifier {
+ ModifierType modifierType;
+ uint8_t modifierBase;
+ struct OperandSpecifier operands[X86_MAX_OPERANDS];
+
+ /* The macro below must be defined wherever this file is included. */
+ INSTRUCTION_SPECIFIER_FIELDS
+};
+
+/*
+ * Decoding mode for the Intel disassembler. 16-bit, 32-bit, and 64-bit mode
+ * are supported, and represent real mode, IA-32e, and IA-32e in 64-bit mode,
+ * respectively.
+ */
+typedef enum {
+ MODE_16BIT,
+ MODE_32BIT,
+ MODE_64BIT
+} DisassemblerMode;
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/Makefile b/contrib/llvm/lib/Target/X86/Makefile
new file mode 100644
index 0000000..f4ff894
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/Makefile
@@ -0,0 +1,25 @@
+##===- lib/Target/X86/Makefile -----------------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+LIBRARYNAME = LLVMX86CodeGen
+TARGET = X86
+
+# Make sure that tblgen is run, first thing.
+BUILT_SOURCES = X86GenRegisterInfo.h.inc X86GenRegisterNames.inc \
+ X86GenRegisterInfo.inc X86GenInstrNames.inc \
+ X86GenInstrInfo.inc X86GenAsmWriter.inc X86GenAsmMatcher.inc \
+ X86GenAsmWriter1.inc X86GenDAGISel.inc \
+ X86GenDisassemblerTables.inc X86GenFastISel.inc \
+ X86GenCallingConv.inc X86GenSubtarget.inc \
+ X86GenEDInfo.inc
+
+DIRS = AsmPrinter AsmParser Disassembler TargetInfo
+
+include $(LEVEL)/Makefile.common
diff --git a/contrib/llvm/lib/Target/X86/README-FPStack.txt b/contrib/llvm/lib/Target/X86/README-FPStack.txt
new file mode 100644
index 0000000..be28e8b
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/README-FPStack.txt
@@ -0,0 +1,85 @@
+//===---------------------------------------------------------------------===//
+// Random ideas for the X86 backend: FP stack related stuff
+//===---------------------------------------------------------------------===//
+
+//===---------------------------------------------------------------------===//
+
+Some targets (e.g. Athlons) prefer ffreep to fstp ST(0):
+http://gcc.gnu.org/ml/gcc-patches/2004-04/msg00659.html
+
+//===---------------------------------------------------------------------===//
+
+This should use fiadd on chips where it is profitable:
+double foo(double P, int *I) { return P+*I; }
+
+We have fiadd patterns now but the following have the same cost and
+complexity. We need a way to specify that the latter is more profitable.
+
+def FpADD32m : FpI<(ops RFP:$dst, RFP:$src1, f32mem:$src2), OneArgFPRW,
+ [(set RFP:$dst, (fadd RFP:$src1,
+ (extloadf64f32 addr:$src2)))]>;
+ // ST(0) = ST(0) + [mem32]
+
+def FpIADD32m : FpI<(ops RFP:$dst, RFP:$src1, i32mem:$src2), OneArgFPRW,
+ [(set RFP:$dst, (fadd RFP:$src1,
+ (X86fild addr:$src2, i32)))]>;
+ // ST(0) = ST(0) + [mem32int]
+
+//===---------------------------------------------------------------------===//
+
+The FP stackifier needs to be global. Also, it should handle simple
+permutations to reduce the number of shuffle instructions, e.g. turning:
+
+fld P -> fld Q
+fld Q fld P
+fxch
+
+or:
+
+fxch -> fucomi
+fucomi jl X
+jg X
+
+Ideas:
+http://gcc.gnu.org/ml/gcc-patches/2004-11/msg02410.html
+
+
+//===---------------------------------------------------------------------===//
+
+Add a target specific hook to DAG combiner to handle SINT_TO_FP and
+FP_TO_SINT when the source operand is already in memory.
+
+//===---------------------------------------------------------------------===//
+
+Open code rint,floor,ceil,trunc:
+http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02006.html
+http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02011.html
+
+Open code the sincos[f] libcall.
+
+//===---------------------------------------------------------------------===//
+
+None of the FPStack instructions are handled in
+X86RegisterInfo::foldMemoryOperand, which prevents the spiller from
+folding spill code into the instructions.
+
+//===---------------------------------------------------------------------===//
+
+Currently the x86 codegen isn't very good at mixing SSE and FPStack
+code:
+
+unsigned int foo(double x) { return x; }
+
+foo:
+ subl $20, %esp
+ movsd 24(%esp), %xmm0
+ movsd %xmm0, 8(%esp)
+ fldl 8(%esp)
+ fisttpll (%esp)
+ movl (%esp), %eax
+ addl $20, %esp
+ ret
+
+This just requires being smarter when custom expanding fptoui.
+
+//===---------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/X86/README-MMX.txt b/contrib/llvm/lib/Target/X86/README-MMX.txt
new file mode 100644
index 0000000..a6c8616
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/README-MMX.txt
@@ -0,0 +1,71 @@
+//===---------------------------------------------------------------------===//
+// Random ideas for the X86 backend: MMX-specific stuff.
+//===---------------------------------------------------------------------===//
+
+//===---------------------------------------------------------------------===//
+
+This:
+
+#include <mmintrin.h>
+
+__v2si qux(int A) {
+ return (__v2si){ 0, A };
+}
+
+is compiled into:
+
+_qux:
+ subl $28, %esp
+ movl 32(%esp), %eax
+ movd %eax, %mm0
+ movq %mm0, (%esp)
+ movl (%esp), %eax
+ movl %eax, 20(%esp)
+ movq %mm0, 8(%esp)
+ movl 12(%esp), %eax
+ movl %eax, 16(%esp)
+ movq 16(%esp), %mm0
+ addl $28, %esp
+ ret
+
+Yuck!
+
+GCC gives us:
+
+_qux:
+ subl $12, %esp
+ movl 16(%esp), %eax
+ movl 20(%esp), %edx
+ movl $0, (%eax)
+ movl %edx, 4(%eax)
+ addl $12, %esp
+ ret $4
+
+//===---------------------------------------------------------------------===//
+
+We generate crappy code for this:
+
+__m64 t() {
+ return _mm_cvtsi32_si64(1);
+}
+
+_t:
+ subl $12, %esp
+ movl $1, %eax
+ movd %eax, %mm0
+ movq %mm0, (%esp)
+ movl (%esp), %eax
+ movl 4(%esp), %edx
+ addl $12, %esp
+ ret
+
+The extra stack traffic is covered in the previous entry. But the other reason
+is that we are not smart about materializing constants in MMX registers. With
+-m64 we get:
+
+ movl $1, %eax
+ movd %eax, %mm0
+ movd %mm0, %rax
+ ret
+
+We should be using a constant pool load instead:
+ movq LC0(%rip), %rax
diff --git a/contrib/llvm/lib/Target/X86/README-SSE.txt b/contrib/llvm/lib/Target/X86/README-SSE.txt
new file mode 100644
index 0000000..e5f84e8
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/README-SSE.txt
@@ -0,0 +1,989 @@
+//===---------------------------------------------------------------------===//
+// Random ideas for the X86 backend: SSE-specific stuff.
+//===---------------------------------------------------------------------===//
+
+- Consider eliminating the unaligned SSE load intrinsics, replacing them with
+ unaligned LLVM load instructions.
+
+//===---------------------------------------------------------------------===//
+
+Expand libm rounding functions inline: Significant speedups possible.
+http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html
+
+//===---------------------------------------------------------------------===//
+
+When compiled with unsafe math enabled, "main" should enable SSE DAZ mode and
+other fast SSE modes.
+
+//===---------------------------------------------------------------------===//
+
+Think about doing i64 math in SSE regs on x86-32.
+
+//===---------------------------------------------------------------------===//
+
+This testcase should have no SSE instructions in it, and only one load from
+a constant pool:
+
+double %test3(bool %B) {
+ %C = select bool %B, double 123.412, double 523.01123123
+ ret double %C
+}
+
+Currently, the select is being lowered, which prevents the dag combiner from
+turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'
+
+The pattern isel got this one right.
+
+//===---------------------------------------------------------------------===//
+
+SSE doesn't have [mem] op= reg instructions. If we have an SSE instruction
+like this:
+
+ X += y
+
+and the register allocator decides to spill X, it is cheaper to emit this as:
+
+Y += [xslot]
+store Y -> [xslot]
+
+than as:
+
+tmp = [xslot]
+tmp += y
+store tmp -> [xslot]
+
+..and this uses one fewer register (so this should be done at load folding
+time, not at spiller time). *Note* however that this can only be done
+if Y is dead. Here's a testcase:
+
+@.str_3 = external global [15 x i8]
+declare void @printf(i32, ...)
+define void @main() {
+build_tree.exit:
+ br label %no_exit.i7
+
+no_exit.i7: ; preds = %no_exit.i7, %build_tree.exit
+ %tmp.0.1.0.i9 = phi double [ 0.000000e+00, %build_tree.exit ],
+ [ %tmp.34.i18, %no_exit.i7 ]
+ %tmp.0.0.0.i10 = phi double [ 0.000000e+00, %build_tree.exit ],
+ [ %tmp.28.i16, %no_exit.i7 ]
+ %tmp.28.i16 = fadd double %tmp.0.0.0.i10, 0.000000e+00
+ %tmp.34.i18 = fadd double %tmp.0.1.0.i9, 0.000000e+00
+ br i1 false, label %Compute_Tree.exit23, label %no_exit.i7
+
+Compute_Tree.exit23: ; preds = %no_exit.i7
+ tail call void (i32, ...)* @printf( i32 0 )
+ store double %tmp.34.i18, double* null
+ ret void
+}
+
+We currently emit:
+
+.BBmain_1:
+ xorpd %XMM1, %XMM1
+ addsd %XMM0, %XMM1
+*** movsd %XMM2, QWORD PTR [%ESP + 8]
+*** addsd %XMM2, %XMM1
+*** movsd QWORD PTR [%ESP + 8], %XMM2
+ jmp .BBmain_1 # no_exit.i7
+
+This is a bugpoint-reduced testcase, which is why it doesn't make much sense
+(e.g. it's an infinite loop). :)
+
+//===---------------------------------------------------------------------===//
+
+SSE should implement 'select_cc' using 'emulated conditional moves' that use
+pcmp/pand/pandn/por to do a selection instead of a conditional branch:
+
+double %X(double %Y, double %Z, double %A, double %B) {
+ %C = setlt double %A, %B
+ %z = fadd double %Z, 0.0 ;; select operand is not a load
+ %D = select bool %C, double %Y, double %z
+ ret double %D
+}
+
+We currently emit:
+
+_X:
+ subl $12, %esp
+ xorpd %xmm0, %xmm0
+ addsd 24(%esp), %xmm0
+ movsd 32(%esp), %xmm1
+ movsd 16(%esp), %xmm2
+ ucomisd 40(%esp), %xmm1
+ jb LBB_X_2
+LBB_X_1:
+ movsd %xmm0, %xmm2
+LBB_X_2:
+ movsd %xmm2, (%esp)
+ fldl (%esp)
+ addl $12, %esp
+ ret
+
+//===---------------------------------------------------------------------===//
+
+It's not clear whether we should use pxor or xorps / xorpd to clear XMM
+registers. The choice may depend on subtarget information. We should do some
+more experiments on different x86 machines.
+
+//===---------------------------------------------------------------------===//
+
+Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
+feasible.
+
+//===---------------------------------------------------------------------===//
+
+Codegen:
+ if (copysign(1.0, x) == copysign(1.0, y))
+into:
+ if (x^y & mask)
+when using SSE.
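+
+A scalar sketch of the bit-level identity this relies on (illustrative only;
+the mask assumes IEEE-754 doubles):
+
+#include <stdint.h>
+#include <string.h>
+
+/* copysign(1.0, x) == copysign(1.0, y) iff x and y have the same sign bit */
+static int same_sign(double x, double y) {
+  uint64_t xb, yb;
+  memcpy(&xb, &x, sizeof xb);
+  memcpy(&yb, &y, sizeof yb);
+  return ((xb ^ yb) & 0x8000000000000000ULL) == 0;
+}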
+
+//===---------------------------------------------------------------------===//
+
+Use movhps to update upper 64-bits of a v4sf value. Also movlps on lower half
+of a v4sf value.
+
+//===---------------------------------------------------------------------===//
+
+Better codegen for vector_shuffles like { x, 0, 0, 0 } or { x, 0, x, 0 }.
+Perhaps use pxor / xorp* to clear an XMM register first?
+
+//===---------------------------------------------------------------------===//
+
+How to decide when to use the "floating point version" of logical ops? Here are
+some code fragments:
+
+ movaps LCPI5_5, %xmm2
+ divps %xmm1, %xmm2
+ mulps %xmm2, %xmm3
+ mulps 8656(%ecx), %xmm3
+ addps 8672(%ecx), %xmm3
+ andps LCPI5_6, %xmm2
+ andps LCPI5_1, %xmm3
+ por %xmm2, %xmm3
+ movdqa %xmm3, (%edi)
+
+ movaps LCPI5_5, %xmm1
+ divps %xmm0, %xmm1
+ mulps %xmm1, %xmm3
+ mulps 8656(%ecx), %xmm3
+ addps 8672(%ecx), %xmm3
+ andps LCPI5_6, %xmm1
+ andps LCPI5_1, %xmm3
+ orps %xmm1, %xmm3
+ movaps %xmm3, 112(%esp)
+ movaps %xmm3, (%ebx)
+
+Due to some minor source change, the latter case ended up using orps and movaps
+instead of por and movdqa. Does it matter?
+
+//===---------------------------------------------------------------------===//
+
+X86RegisterInfo::copyRegToReg() returns X86::MOVAPSrr for VR128. Is it possible
+to choose between movaps, movapd, and movdqa based on types of source and
+destination?
+
+How about andps, andpd, and pand? Do we really care about the type of the
+packed elements? If not, why not always use the "ps" variants, which are
+likely to be shorter?
+
+//===---------------------------------------------------------------------===//
+
+External test Nurbs exposed some problems. Look for
+__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
+emits:
+
+ movaps (%edx), %xmm2 #59.21
+ movaps (%edx), %xmm5 #60.21
+ movaps (%edx), %xmm4 #61.21
+ movaps (%edx), %xmm3 #62.21
+ movl 40(%ecx), %ebp #69.49
+ shufps $0, %xmm2, %xmm5 #60.21
+ movl 100(%esp), %ebx #69.20
+ movl (%ebx), %edi #69.20
+ imull %ebp, %edi #69.49
+ addl (%eax), %edi #70.33
+ shufps $85, %xmm2, %xmm4 #61.21
+ shufps $170, %xmm2, %xmm3 #62.21
+ shufps $255, %xmm2, %xmm2 #63.21
+ lea (%ebp,%ebp,2), %ebx #69.49
+ negl %ebx #69.49
+ lea -3(%edi,%ebx), %ebx #70.33
+ shll $4, %ebx #68.37
+ addl 32(%ecx), %ebx #68.37
+ testb $15, %bl #91.13
+ jne L_B1.24 # Prob 5% #91.13
+
+This is the llvm code after instruction scheduling:
+
+cond_next140 (0xa910740, LLVM BB @0xa90beb0):
+ %reg1078 = MOV32ri -3
+ %reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0
+ %reg1037 = MOV32rm %reg1024, 1, %NOREG, 40
+ %reg1080 = IMUL32rr %reg1079, %reg1037
+ %reg1081 = MOV32rm %reg1058, 1, %NOREG, 0
+ %reg1038 = LEA32r %reg1081, 1, %reg1080, -3
+ %reg1036 = MOV32rm %reg1024, 1, %NOREG, 32
+ %reg1082 = SHL32ri %reg1038, 4
+ %reg1039 = ADD32rr %reg1036, %reg1082
+ %reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0
+ %reg1034 = SHUFPSrr %reg1083, %reg1083, 170
+ %reg1032 = SHUFPSrr %reg1083, %reg1083, 0
+ %reg1035 = SHUFPSrr %reg1083, %reg1083, 255
+ %reg1033 = SHUFPSrr %reg1083, %reg1083, 85
+ %reg1040 = MOV32rr %reg1039
+ %reg1084 = AND32ri8 %reg1039, 15
+ CMP32ri8 %reg1084, 0
+ JE mbb<cond_next204,0xa914d30>
+
+Still ok. After register allocation:
+
+cond_next140 (0xa910740, LLVM BB @0xa90beb0):
+ %EAX = MOV32ri -3
+ %EDX = MOV32rm <fi#3>, 1, %NOREG, 0
+ ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0
+ %EDX = MOV32rm <fi#7>, 1, %NOREG, 0
+ %EDX = MOV32rm %EDX, 1, %NOREG, 40
+ IMUL32rr %EAX<def&use>, %EDX
+ %ESI = MOV32rm <fi#5>, 1, %NOREG, 0
+ %ESI = MOV32rm %ESI, 1, %NOREG, 0
+ MOV32mr <fi#4>, 1, %NOREG, 0, %ESI
+ %EAX = LEA32r %ESI, 1, %EAX, -3
+ %ESI = MOV32rm <fi#7>, 1, %NOREG, 0
+ %ESI = MOV32rm %ESI, 1, %NOREG, 32
+ %EDI = MOV32rr %EAX
+ SHL32ri %EDI<def&use>, 4
+ ADD32rr %EDI<def&use>, %ESI
+ %XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0
+ %XMM1 = MOVAPSrr %XMM0
+ SHUFPSrr %XMM1<def&use>, %XMM1, 170
+ %XMM2 = MOVAPSrr %XMM0
+ SHUFPSrr %XMM2<def&use>, %XMM2, 0
+ %XMM3 = MOVAPSrr %XMM0
+ SHUFPSrr %XMM3<def&use>, %XMM3, 255
+ SHUFPSrr %XMM0<def&use>, %XMM0, 85
+ %EBX = MOV32rr %EDI
+ AND32ri8 %EBX<def&use>, 15
+ CMP32ri8 %EBX, 0
+ JE mbb<cond_next204,0xa914d30>
+
+This looks really bad. The problem is that shufps is a destructive opcode:
+since the same value appears as operand two in more than one shufps, the
+result is a number of copies. Note that icc also suffers from the same
+problem. Either the instruction selector should select pshufd, or the register
+allocator could make the two-address to three-address transformation.
+
+It also exposes some other problems. See MOV32ri -3 and the spills.
+
+//===---------------------------------------------------------------------===//
+
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=25500
+
+LLVM is producing bad code.
+
+LBB_main_4: # cond_true44
+ addps %xmm1, %xmm2
+ subps %xmm3, %xmm2
+ movaps (%ecx), %xmm4
+ movaps %xmm2, %xmm1
+ addps %xmm4, %xmm1
+ addl $16, %ecx
+ incl %edx
+ cmpl $262144, %edx
+ movaps %xmm3, %xmm2
+ movaps %xmm4, %xmm3
+ jne LBB_main_4 # cond_true44
+
+There are two problems. 1) There is no need for two loop induction variables;
+we can compare against 262144 * 16. 2) A known register coalescer issue: we
+should be able to eliminate one of the movaps:
+
+ addps %xmm2, %xmm1 <=== Commute!
+ subps %xmm3, %xmm1
+ movaps (%ecx), %xmm4
+ movaps %xmm1, %xmm1 <=== Eliminate!
+ addps %xmm4, %xmm1
+ addl $16, %ecx
+ incl %edx
+ cmpl $262144, %edx
+ movaps %xmm3, %xmm2
+ movaps %xmm4, %xmm3
+ jne LBB_main_4 # cond_true44
+
+//===---------------------------------------------------------------------===//
+
+Consider:
+
+__m128 test(float a) {
+ return _mm_set_ps(0.0, 0.0, 0.0, a*a);
+}
+
+This compiles into:
+
+movss 4(%esp), %xmm1
+mulss %xmm1, %xmm1
+xorps %xmm0, %xmm0
+movss %xmm1, %xmm0
+ret
+
+Because mulss doesn't modify the top 3 elements, the top elements of
+xmm1 are already zero'd. We could compile this to:
+
+movss 4(%esp), %xmm0
+mulss %xmm0, %xmm0
+ret
+
+//===---------------------------------------------------------------------===//
+
+Here's a sick and twisted idea. Consider code like this:
+
+__m128 test(__m128 a) {
+  float b = *(float*)&a;
+ ...
+ return _mm_set_ps(0.0, 0.0, 0.0, b);
+}
+
+This might compile to this code:
+
+movaps c(%esp), %xmm1
+xorps %xmm0, %xmm0
+movss %xmm1, %xmm0
+ret
+
+Now consider if the ... code caused xmm1 to get spilled. This might produce
+this code:
+
+movaps c(%esp), %xmm1
+movaps %xmm1, c2(%esp)
+...
+
+xorps %xmm0, %xmm0
+movaps c2(%esp), %xmm1
+movss %xmm1, %xmm0
+ret
+
+However, since the reload is only used by these instructions, we could
+"fold" it into the uses, producing something like this:
+
+movaps c(%esp), %xmm1
+movaps %xmm1, c2(%esp)
+...
+
+movss c2(%esp), %xmm0
+ret
+
+... saving two instructions.
+
+The basic idea is that a reload from a spill slot can, if only one 4-byte
+chunk is used, bring in 3 zeros and the one element instead of all 4 elements.
+This can be used to simplify a variety of shuffle operations where some of the
+elements are fixed zeros.
+
+//===---------------------------------------------------------------------===//
+
+__m128d test1( __m128d A, __m128d B) {
+ return _mm_shuffle_pd(A, B, 0x3);
+}
+
+compiles to
+
+shufpd $3, %xmm1, %xmm0
+
+Perhaps it's better to use unpckhpd instead?
+
+unpckhpd %xmm1, %xmm0
+
+Don't know if unpckhpd is faster. But it is shorter.
+
+//===---------------------------------------------------------------------===//
+
+This code generates ugly code, probably due to costs being off or something:
+
+define void @test(float* %P, <4 x float>* %P2 ) {
+ %xFloat0.688 = load float* %P
+ %tmp = load <4 x float>* %P2
+ %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3
+ store <4 x float> %inFloat3.713, <4 x float>* %P2
+ ret void
+}
+
+Generates:
+
+_test:
+ movl 8(%esp), %eax
+ movaps (%eax), %xmm0
+ pxor %xmm1, %xmm1
+ movaps %xmm0, %xmm2
+ shufps $50, %xmm1, %xmm2
+ shufps $132, %xmm2, %xmm0
+ movaps %xmm0, (%eax)
+ ret
+
+Would it be better to generate:
+
+_test:
+ movl 8(%esp), %ecx
+ movaps (%ecx), %xmm0
+ xor %eax, %eax
+ pinsrw $6, %eax, %xmm0
+ pinsrw $7, %eax, %xmm0
+ movaps %xmm0, (%ecx)
+ ret
+
+?
+
+//===---------------------------------------------------------------------===//
+
+Some useful information in the Apple Altivec / SSE Migration Guide:
+
+http://developer.apple.com/documentation/Performance/Conceptual/
+Accelerate_sse_migration/index.html
+
+e.g. SSE select using and, andnot, or. Various SSE compare translations.
+
+//===---------------------------------------------------------------------===//
+
+Add hooks to commute some CMPP operations.
+
+//===---------------------------------------------------------------------===//
+
+Apply the same transformation that merged four float loads into a single
+128-bit load to loads from the constant pool.
+
+//===---------------------------------------------------------------------===//
+
+Floating point max / min are commutable when -enable-unsafe-fp-path is
+specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other
+nodes which are selected to max / min instructions that are marked commutable.
+
+//===---------------------------------------------------------------------===//
+
+We should materialize vector constants like "all ones" and "signbit" with
+code like:
+
+ cmpeqps xmm1, xmm1 ; xmm1 = all-ones
+
+and:
+ cmpeqps xmm1, xmm1 ; xmm1 = all-ones
+ psrlq xmm1, 31 ; xmm1 = all 100000000000...
+
+instead of using a load from the constant pool. The latter is important for
+ABS/NEG/copysign etc.
+
+//===---------------------------------------------------------------------===//
+
+These functions:
+
+#include <xmmintrin.h>
+__m128i a;
+void x(unsigned short n) {
+ a = _mm_slli_epi32 (a, n);
+}
+void y(unsigned n) {
+ a = _mm_slli_epi32 (a, n);
+}
+
+compile to ( -O3 -static -fomit-frame-pointer):
+_x:
+ movzwl 4(%esp), %eax
+ movd %eax, %xmm0
+ movaps _a, %xmm1
+ pslld %xmm0, %xmm1
+ movaps %xmm1, _a
+ ret
+_y:
+ movd 4(%esp), %xmm0
+ movaps _a, %xmm1
+ pslld %xmm0, %xmm1
+ movaps %xmm1, _a
+ ret
+
+"y" looks good, but "x" does silly movzwl stuff around into a GPR. It seems
+like movd would be sufficient in both cases as the value is already zero
+extended in the 32-bit stack slot IIRC. For signed short, it should also be
+save, as a really-signed value would be undefined for pslld.
+
+
+//===---------------------------------------------------------------------===//
+
+#include <math.h>
+int t1(double d) { return signbit(d); }
+
+This currently compiles to:
+ subl $12, %esp
+ movsd 16(%esp), %xmm0
+ movsd %xmm0, (%esp)
+ movl 4(%esp), %eax
+ shrl $31, %eax
+ addl $12, %esp
+ ret
+
+We should use movmskp{s|d} instead.
+
+//===---------------------------------------------------------------------===//
+
+CodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single
+(aligned) vector load. This functionality has a couple of problems.
+
+1. The code to infer alignment from loads of globals is in the X86 backend,
+ not the dag combiner. This is because dagcombine2 needs to be able to see
+ through the X86ISD::Wrapper node, which DAGCombine can't really do.
+2. The code for turning 4 x load into a single vector load is target
+ independent and should be moved to the dag combiner.
+3. The code for turning 4 x load into a vector load can only handle a direct
+ load from a global or a direct load from the stack. It should be generalized
+ to handle any load from P, P+4, P+8, P+12, where P can be anything.
+4. The alignment inference code cannot handle loads from globals in non-static
+ mode because it doesn't look through the extra dyld stub load. If you try
+ vec_align.ll without -relocation-model=static, you'll see what I mean.
+
+//===---------------------------------------------------------------------===//
+
+We should lower store(fneg(load p), q) into an integer load+xor+store, which
+eliminates a constant pool load. For example, consider:
+
+define i64 @ccosf(float %z.0, float %z.1) nounwind readonly {
+entry:
+ %tmp6 = fsub float -0.000000e+00, %z.1 ; <float> [#uses=1]
+ %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly
+ ret i64 %tmp20
+}
+
+This currently compiles to:
+
+LCPI1_0: # <4 x float>
+ .long 2147483648 # float -0
+ .long 2147483648 # float -0
+ .long 2147483648 # float -0
+ .long 2147483648 # float -0
+_ccosf:
+ subl $12, %esp
+ movss 16(%esp), %xmm0
+ movss %xmm0, 4(%esp)
+ movss 20(%esp), %xmm0
+ xorps LCPI1_0, %xmm0
+ movss %xmm0, (%esp)
+ call L_ccoshf$stub
+ addl $12, %esp
+ ret
+
+Note the load into xmm0, then xor (to negate), then store. In PIC mode,
+this code computes the pic base and does two loads to do the constant pool
+load, so the improvement is much bigger.
+
+The tricky part about this xform is that the argument load/store isn't exposed
+until post-legalize, and at that point, the fneg has been custom expanded into
+an X86 fxor. This means that we need to handle this case in the x86 backend
+instead of in target independent code.
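+
+At the C level, the integer form of the transform is roughly the following
+(an illustrative sketch; the mask assumes IEEE-754 floats):
+
+#include <stdint.h>
+#include <string.h>
+
+/* store(fneg(load p), q) as an integer load, sign-bit xor, and store */
+static void store_fneg(const float *p, float *q) {
+  uint32_t bits;
+  memcpy(&bits, p, sizeof bits);
+  bits ^= 0x80000000u;
+  memcpy(q, &bits, sizeof bits);
+}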
+
+//===---------------------------------------------------------------------===//
+
+Non-SSE4 insert into 16 x i8 is atrociously bad.
+
+//===---------------------------------------------------------------------===//
+
+<2 x i64> extract is substantially worse than <2 x f64>, even if the destination
+is memory.
+
+//===---------------------------------------------------------------------===//
+
+SSE4 extract-to-mem ops aren't being pattern matched because of the AssertZext
+sitting between the truncate and the extract.
+
+//===---------------------------------------------------------------------===//
+
+INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert
+any number of 0.0 simultaneously. Currently we only use it for simple
+insertions.
+
+See comments in LowerINSERT_VECTOR_ELT_SSE4.
+
+//===---------------------------------------------------------------------===//
+
+On a random note, SSE2 should declare insert/extract of 2 x f64 as legal, not
+Custom. All combinations of insert/extract reg-reg, reg-mem, and mem-reg are
+legal, it'll just take a few extra patterns written in the .td file.
+
+Note: this is not a code quality issue; the custom lowered code happens to be
+right, but we shouldn't have to custom lower anything. This is probably related
+to <2 x i64> ops being so bad.
+
+//===---------------------------------------------------------------------===//
+
+'select' on vectors and scalars could be a whole lot better. We currently
+lower them to conditional branches. On x86-64 for example, we compile this:
+
+double test(double a, double b, double c, double d) { return a<b ? c : d; }
+
+to:
+
+_test:
+ ucomisd %xmm0, %xmm1
+ ja LBB1_2 # entry
+LBB1_1: # entry
+ movapd %xmm3, %xmm2
+LBB1_2: # entry
+ movapd %xmm2, %xmm0
+ ret
+
+instead of:
+
+_test:
+ cmpltsd %xmm1, %xmm0
+ andpd %xmm0, %xmm2
+ andnpd %xmm3, %xmm0
+ orpd %xmm2, %xmm0
+ ret
+
+For unpredictable branches, the latter is much more efficient. This should
+just be a matter of having scalar SSE map to SELECT_CC and custom expanding
+or iseling it.
+
+//===---------------------------------------------------------------------===//
+
+LLVM currently generates stack realignment code when it is not actually
+needed. The problem is that we need to know about stack alignment too early,
+before RA runs.
+
+At that point we don't know whether there will be vector spills or not, so the
+stack realignment logic has to be overly conservative; otherwise we could
+produce unaligned loads/stores.
+
+Fixing this will require some huge RA changes.
+
+Testcase:
+#include <emmintrin.h>
+
+typedef short vSInt16 __attribute__ ((__vector_size__ (16)));
+
+static const vSInt16 a = {- 22725, - 12873, - 22725, - 12873, - 22725, - 12873,
+- 22725, - 12873};
+
+vSInt16 madd(vSInt16 b)
+{
+ return _mm_madd_epi16(a, b);
+}
+
+Generated code (x86-32, linux):
+madd:
+ pushl %ebp
+ movl %esp, %ebp
+ andl $-16, %esp
+ movaps .LCPI1_0, %xmm1
+ pmaddwd %xmm1, %xmm0
+ movl %ebp, %esp
+ popl %ebp
+ ret
+
+//===---------------------------------------------------------------------===//
+
+Consider:
+#include <emmintrin.h>
+__m128 foo2 (float x) {
+ return _mm_set_ps (0, 0, x, 0);
+}
+
+In x86-32 mode, we generate this spiffy code:
+
+_foo2:
+ movss 4(%esp), %xmm0
+ pshufd $81, %xmm0, %xmm0
+ ret
+
+in x86-64 mode, we generate this code, which could be better:
+
+_foo2:
+ xorps %xmm1, %xmm1
+ movss %xmm0, %xmm1
+ pshufd $81, %xmm1, %xmm0
+ ret
+
+In sse4 mode, we could use insertps to make both better.
+
+Here's another testcase that could use insertps [mem]:
+
+#include <xmmintrin.h>
+extern float x2, x3;
+__m128 foo1 (float x1, float x4) {
+ return _mm_set_ps (x2, x1, x3, x4);
+}
+
+gcc mainline compiles it to:
+
+foo1:
+ insertps $0x10, x2(%rip), %xmm0
+ insertps $0x10, x3(%rip), %xmm1
+ movaps %xmm1, %xmm2
+ movlhps %xmm0, %xmm2
+ movaps %xmm2, %xmm0
+ ret
+
+//===---------------------------------------------------------------------===//
+
+We compile vector multiply-by-constant into poor code:
+
+define <4 x i32> @f(<4 x i32> %i) nounwind {
+ %A = mul <4 x i32> %i, < i32 10, i32 10, i32 10, i32 10 >
+ ret <4 x i32> %A
+}
+
+On targets without SSE4.1, this compiles into:
+
+LCPI1_0: ## <4 x i32>
+ .long 10
+ .long 10
+ .long 10
+ .long 10
+ .text
+ .align 4,0x90
+ .globl _f
+_f:
+ pshufd $3, %xmm0, %xmm1
+ movd %xmm1, %eax
+ imull LCPI1_0+12, %eax
+ movd %eax, %xmm1
+ pshufd $1, %xmm0, %xmm2
+ movd %xmm2, %eax
+ imull LCPI1_0+4, %eax
+ movd %eax, %xmm2
+ punpckldq %xmm1, %xmm2
+ movd %xmm0, %eax
+ imull LCPI1_0, %eax
+ movd %eax, %xmm1
+ movhlps %xmm0, %xmm0
+ movd %xmm0, %eax
+ imull LCPI1_0+8, %eax
+ movd %eax, %xmm0
+ punpckldq %xmm0, %xmm1
+ movaps %xmm1, %xmm0
+ punpckldq %xmm2, %xmm0
+ ret
+
+It would be better to synthesize integer vector multiplication by constants
+using shifts and adds, pslld and paddd here. And even on targets with SSE4.1,
+simple cases such as multiplication by powers of two would be better as
+vector shifts than as multiplications.
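+
+For instance, multiplication by 10 could be open coded with pslld/paddd roughly
+as below (an illustrative intrinsics sketch of the decomposition, not the
+proposed codegen itself):
+
+#include <emmintrin.h>
+
+/* x * 10 == (x << 3) + (x << 1) in each 32-bit lane */
+static __m128i mul10_epi32(__m128i x) {
+  __m128i x8 = _mm_slli_epi32(x, 3);   /* pslld: x * 8 */
+  __m128i x2 = _mm_slli_epi32(x, 1);   /* pslld: x * 2 */
+  return _mm_add_epi32(x8, x2);        /* paddd */
+}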
+
+//===---------------------------------------------------------------------===//
+
+We compile this:
+
+__m128i
+foo2 (char x)
+{
+ return _mm_set_epi8 (1, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 1, 0, 0, 0, 0);
+}
+
+into:
+ movl $1, %eax
+ xorps %xmm0, %xmm0
+ pinsrw $2, %eax, %xmm0
+ movzbl 4(%esp), %eax
+ pinsrw $3, %eax, %xmm0
+ movl $256, %eax
+ pinsrw $7, %eax, %xmm0
+ ret
+
+
+gcc-4.2:
+ subl $12, %esp
+ movzbl 16(%esp), %eax
+ movdqa LC0, %xmm0
+ pinsrw $3, %eax, %xmm0
+ addl $12, %esp
+ ret
+ .const
+ .align 4
+LC0:
+ .word 0
+ .word 0
+ .word 1
+ .word 0
+ .word 0
+ .word 0
+ .word 0
+ .word 256
+
+With SSE4, it should be
+ movdqa .LC0(%rip), %xmm0
+ pinsrb $6, %edi, %xmm0
+
+//===---------------------------------------------------------------------===//
+
+We should transform a shuffle of two vectors of constants into a single vector
+of constants. Likewise, insertelement of a constant into a vector of constants
+should result in a vector of constants. E.g. 2008-06-25-VecISelBug.ll.
+
+We compiled it to something horrible:
+
+ .align 4
+LCPI1_1: ## float
+ .long 1065353216 ## float 1
+ .const
+
+ .align 4
+LCPI1_0: ## <4 x float>
+ .space 4
+ .long 1065353216 ## float 1
+ .space 4
+ .long 1065353216 ## float 1
+ .text
+ .align 4,0x90
+ .globl _t
+_t:
+ xorps %xmm0, %xmm0
+ movhps LCPI1_0, %xmm0
+ movss LCPI1_1, %xmm1
+ movaps %xmm0, %xmm2
+ shufps $2, %xmm1, %xmm2
+ shufps $132, %xmm2, %xmm0
+ movaps %xmm0, 0
+
+//===---------------------------------------------------------------------===//
+rdar://5907648
+
+This function:
+
+float foo(unsigned char x) {
+ return x;
+}
+
+compiles to this IR (x86-32):
+
+define float @foo(i8 zeroext %x) nounwind {
+ %tmp12 = uitofp i8 %x to float ; <float> [#uses=1]
+ ret float %tmp12
+}
+
+compiles to:
+
+_foo:
+ subl $4, %esp
+ movzbl 8(%esp), %eax
+ cvtsi2ss %eax, %xmm0
+ movss %xmm0, (%esp)
+ flds (%esp)
+ addl $4, %esp
+ ret
+
+We should be able to use:
+	cvtsi2ss 8(%esp), %xmm0
+since we know the stack slot is already zext'd.
+
+//===---------------------------------------------------------------------===//
+
+Consider using movlps instead of movsd to implement (scalar_to_vector (loadf64))
+when code size is critical. movlps is slower than movsd on core2 but it's one
+byte shorter.
+
+//===---------------------------------------------------------------------===//
+
+We should use a dynamic programming based approach to tell when using FPStack
+operations is cheaper than SSE. SciMark montecarlo contains code like this
+for example:
+
+double MonteCarlo_num_flops(int Num_samples) {
+ return ((double) Num_samples)* 4.0;
+}
+
+In fpstack mode, this compiles into:
+
+LCPI1_0:
+ .long 1082130432 ## float 4.000000e+00
+_MonteCarlo_num_flops:
+ subl $4, %esp
+ movl 8(%esp), %eax
+ movl %eax, (%esp)
+ fildl (%esp)
+ fmuls LCPI1_0
+ addl $4, %esp
+ ret
+
+in SSE mode, it compiles into significantly slower code:
+
+_MonteCarlo_num_flops:
+ subl $12, %esp
+ cvtsi2sd 16(%esp), %xmm0
+ mulsd LCPI1_0, %xmm0
+ movsd %xmm0, (%esp)
+ fldl (%esp)
+ addl $12, %esp
+ ret
+
+There are also other cases in scimark where using fpstack is better. For
+example, it is cheaper to do fld1 than to load from a constant pool, so
+"load, add 1.0, store" is better done in the fp stack, etc.
+
+//===---------------------------------------------------------------------===//
+
+The X86 backend should be able to if-convert SSE comparisons like "ucomisd" to
+"cmpsd". For example, this code:
+
+double d1(double x) { return x == x ? x : x + x; }
+
+Compiles into:
+
+_d1:
+ ucomisd %xmm0, %xmm0
+ jnp LBB1_2
+ addsd %xmm0, %xmm0
+ ret
+LBB1_2:
+ ret
+
+Also, the 'ret's should be shared. This is PR6032.
+
+//===---------------------------------------------------------------------===//
+
+These should compile into the same code (PR6214). Perhaps instcombine should
+canonicalize the former into the latter?
+
+define float @foo(float %x) nounwind {
+ %t = bitcast float %x to i32
+ %s = and i32 %t, 2147483647
+ %d = bitcast i32 %s to float
+ ret float %d
+}
+
+declare float @fabsf(float %n)
+define float @bar(float %x) nounwind {
+ %d = call float @fabsf(float %x)
+ ret float %d
+}
+
+//===---------------------------------------------------------------------===//
+
+This IR (from PR6194):
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-darwin10.0.0"
+
+%0 = type { double, double }
+%struct.float3 = type { float, float, float }
+
+define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp {
+entry:
+ %tmp18 = extractvalue %0 %0, 0 ; <double> [#uses=1]
+ %tmp19 = bitcast double %tmp18 to i64 ; <i64> [#uses=1]
+ %tmp20 = zext i64 %tmp19 to i128 ; <i128> [#uses=1]
+ %tmp10 = lshr i128 %tmp20, 32 ; <i128> [#uses=1]
+ %tmp11 = trunc i128 %tmp10 to i32 ; <i32> [#uses=1]
+ %tmp12 = bitcast i32 %tmp11 to float ; <float> [#uses=1]
+ %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; <float*> [#uses=1]
+ store float %tmp12, float* %tmp5
+ ret void
+}
+
+Compiles to:
+
+_test: ## @test
+ movd %xmm0, %rax
+ shrq $32, %rax
+ movl %eax, 4(%rdi)
+ ret
+
+This would be better kept in the SSE unit by treating XMM0 as a 4xfloat and
+doing a shuffle from v[1] to v[0] then a float store.
+
+//===---------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/X86/README-UNIMPLEMENTED.txt b/contrib/llvm/lib/Target/X86/README-UNIMPLEMENTED.txt
new file mode 100644
index 0000000..c26c75a
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/README-UNIMPLEMENTED.txt
@@ -0,0 +1,14 @@
+//===---------------------------------------------------------------------===//
+// Testcases that crash the X86 backend because they aren't implemented
+//===---------------------------------------------------------------------===//
+
+These are cases we know the X86 backend doesn't handle. Patches are welcome
+and appreciated, because no one has signed up to implement these yet.
+Implementing these would allow elimination of the corresponding intrinsics,
+which would be great.
+
+1) vector shifts
+2) vector comparisons
+3) vector fp<->int conversions: PR2683, PR2684, PR2685, PR2686, PR2688
+4) bitcasts from vectors to scalars: PR2804
+5) llvm.atomic.cmp.swap.i128.p0i128: PR3462
diff --git a/contrib/llvm/lib/Target/X86/README-X86-64.txt b/contrib/llvm/lib/Target/X86/README-X86-64.txt
new file mode 100644
index 0000000..e8f7c5d
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/README-X86-64.txt
@@ -0,0 +1,300 @@
+//===- README_X86_64.txt - Notes for X86-64 code gen ----------------------===//
+
+Implement different PIC models? Right now we only support Mac OS X with small
+PIC code model.
+
+//===---------------------------------------------------------------------===//
+
+For this:
+
+extern void xx(void);
+void bar(void) {
+ xx();
+}
+
+gcc compiles to:
+
+.globl _bar
+_bar:
+ jmp _xx
+
+We need to do the tailcall optimization as well.
+
+//===---------------------------------------------------------------------===//
+
+AMD64 Optimization Manual 8.2 has some nice information about optimizing integer
+multiplication by a constant. How much of it applies to Intel's X86-64
+implementation? There are definite trade-offs to consider: latency vs. register
+pressure vs. code size.
+
+//===---------------------------------------------------------------------===//
+
+Are we better off using branches instead of cmove to implement FP to
+unsigned i64?
+
+_conv:
+ ucomiss LC0(%rip), %xmm0
+ cvttss2siq %xmm0, %rdx
+ jb L3
+ subss LC0(%rip), %xmm0
+ movabsq $-9223372036854775808, %rax
+ cvttss2siq %xmm0, %rdx
+ xorq %rax, %rdx
+L3:
+ movq %rdx, %rax
+ ret
+
+instead of
+
+_conv:
+ movss LCPI1_0(%rip), %xmm1
+ cvttss2siq %xmm0, %rcx
+ movaps %xmm0, %xmm2
+ subss %xmm1, %xmm2
+ cvttss2siq %xmm2, %rax
+ movabsq $-9223372036854775808, %rdx
+ xorq %rdx, %rax
+ ucomiss %xmm1, %xmm0
+ cmovb %rcx, %rax
+ ret
+
+It seems like the jb branch has a high likelihood of being taken, in which
+case it would have saved a few instructions.
+
+//===---------------------------------------------------------------------===//
+
+Poor codegen:
+
+int X[2];
+int b;
+void test(void) {
+ memset(X, b, 2*sizeof(X[0]));
+}
+
+llc:
+ movq _b@GOTPCREL(%rip), %rax
+ movzbq (%rax), %rax
+ movq %rax, %rcx
+ shlq $8, %rcx
+ orq %rax, %rcx
+ movq %rcx, %rax
+ shlq $16, %rax
+ orq %rcx, %rax
+ movq %rax, %rcx
+ shlq $32, %rcx
+ movq _X@GOTPCREL(%rip), %rdx
+ orq %rax, %rcx
+ movq %rcx, (%rdx)
+ ret
+
+gcc:
+ movq _b@GOTPCREL(%rip), %rax
+ movabsq $72340172838076673, %rdx
+ movzbq (%rax), %rax
+ imulq %rdx, %rax
+ movq _X@GOTPCREL(%rip), %rdx
+ movq %rax, (%rdx)
+ ret
+
+//===---------------------------------------------------------------------===//
+
+Vararg function prologue can be further optimized. Currently all XMM registers
+are stored into register save area. Most of them can be eliminated since the
+upper bound on the number of XMM registers used is passed in %al. gcc produces
+something like the following:
+
+ movzbl %al, %edx
+ leaq 0(,%rdx,4), %rax
+ leaq 4+L2(%rip), %rdx
+ leaq 239(%rsp), %rax
+ jmp *%rdx
+ movaps %xmm7, -15(%rax)
+ movaps %xmm6, -31(%rax)
+ movaps %xmm5, -47(%rax)
+ movaps %xmm4, -63(%rax)
+ movaps %xmm3, -79(%rax)
+ movaps %xmm2, -95(%rax)
+ movaps %xmm1, -111(%rax)
+ movaps %xmm0, -127(%rax)
+L2:
+
+It jumps over the movaps that do not need to be stored. It is hard to see this
+being significant, as it adds 5 instructions (including an indirect branch) to
+avoid executing 0 to 8 stores in the function prologue.
+
+Perhaps we can optimize for the common case where no XMM registers are used for
+parameter passing, i.e. if %al == 0, jump over all the stores. Or in the case
+of a leaf function where we can determine that no XMM input parameter is
+needed, avoid emitting the stores at all.
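+
+A minimal sketch of the situation (hypothetical example):
+
+#include <stdarg.h>
+int sum_ints(int n, ...) {        /* callers pass no FP varargs here */
+  va_list ap;
+  int i, s = 0;
+  va_start(ap, n);
+  for (i = 0; i < n; i++)
+    s += va_arg(ap, int);
+  va_end(ap);
+  return s;
+}
+/* A call such as sum_ints(2, 10, 20) sets %al = 0, so all eight XMM stores
+   in the prologue are dead and could be skipped. */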
+
+//===---------------------------------------------------------------------===//
+
+AMD64 has a complex calling convention for aggregate passing by value:
+
+1. If the size of an object is larger than two eightbytes, or in C++, is a non-
+ POD structure or union type, or contains unaligned fields, it has class
+ MEMORY.
+2. Both eightbytes get initialized to class NO_CLASS.
+3. Each field of an object is classified recursively so that always two fields
+ are considered. The resulting class is calculated according to the classes
+ of the fields in the eightbyte:
+ (a) If both classes are equal, this is the resulting class.
+ (b) If one of the classes is NO_CLASS, the resulting class is the other
+ class.
+ (c) If one of the classes is MEMORY, the result is the MEMORY class.
+ (d) If one of the classes is INTEGER, the result is the INTEGER.
+ (e) If one of the classes is X87, X87UP, COMPLEX_X87 class, MEMORY is used as
+ class.
+ (f) Otherwise class SSE is used.
+4. Then a post merger cleanup is done:
+ (a) If one of the classes is MEMORY, the whole argument is passed in memory.
+   (b) If SSEUP is not preceded by SSE, it is converted to SSE.
+
+Currently llvm frontend does not handle this correctly.
+
+Problem 1:
+ typedef struct { int i; double d; } QuadWordS;
+It is currently passed in two i64 integer registers. However, a gcc-compiled
+callee expects the second element 'd' to be passed in XMM0.
+
+Problem 2:
+ typedef struct { int32_t i; float j; double d; } QuadWordS;
+The first two fields together have the size of an i64, so they will be combined
+and passed in an integer register (RDI). The third field is still passed in XMM0.
+
+Problem 3:
+ typedef struct { int64_t i; int8_t j; int64_t d; } S;
+ void test(S s)
+The size of this aggregate is greater than two i64s, so it should be passed in
+memory. Currently llvm breaks it down and passes it in three integer
+registers.
+
+Problem 4:
+Taking problem 3 one step further: a function expects an aggregate value in
+memory followed by more parameter(s) passed in register(s).
+ void test(S s, int b)
+
+LLVM IR does not allow passing aggregates as parameters, so the frontend must
+break the aggregate value (in problems 3 and 4) into a number of scalar values:
+ void %test(long %s.i, byte %s.j, long %s.d);
+
+However, if the backend were to lower this code literally it would pass the 3
+values in integer registers. To force it to be passed in memory, the frontend
+should change the function signature to:
+ void %test(long %undef1, long %undef2, long %undef3, long %undef4,
+ long %undef5, long %undef6,
+ long %s.i, byte %s.j, long %s.d);
+And the call site would look something like this:
+ call void %test( undef, undef, undef, undef, undef, undef,
+ %tmp.s.i, %tmp.s.j, %tmp.s.d );
+The first 6 undef parameters would exhaust the 6 integer registers used for
+parameter passing. The following three integer values would then be forced into
+memory.
+
+For problem 4, the scalar parameter ('b' above, shown as %d below) would be
+moved to the front of the parameter list so it will be passed in a register:
+ void %test(int %d,
+ long %undef1, long %undef2, long %undef3, long %undef4,
+ long %undef5, long %undef6,
+ long %s.i, byte %s.j, long %s.d);
+
+//===---------------------------------------------------------------------===//
+
+Right now the asm printer assumes a GlobalAddress is accessed via RIP-relative
+addressing. Therefore, it is not possible to generate this:
+ movabsq $__ZTV10polynomialIdE+16, %rax
+
+That is ok for now since we currently only support small model. So the above
+is selected as
+ leaq __ZTV10polynomialIdE+16(%rip), %rax
+
+This is probably slightly slower but is much shorter than movabsq. However, if
+we were to support medium or larger code models, we would need to use the movabs
+instruction. We should probably introduce something like AbsoluteAddress to
+distinguish it from GlobalAddress so the asm printer and JIT code emitter can
+do the right thing.
+
+//===---------------------------------------------------------------------===//
+
+It's not possible to reference the AH, BH, CH, and DH registers in an
+instruction requiring a REX prefix. However, divb and mulb both produce results
+in AH. If isel emits a CopyFromReg from AH, it gets turned into a movb whose
+destination could be allocated to r8b - r15b, which cannot be encoded.
+
+To get around this, isel emits a CopyFromReg from AX and then right-shifts it
+down by 8 and truncates it. It's not pretty but it works. We need some register
+allocation magic to make the hack go away (e.g. putting additional constraints
+on the result of the movb).
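+
+A small function that can select to divb (hypothetical example; whether the
+8-bit divide is actually used depends on how the IR is narrowed):
+
+unsigned char rem8(unsigned char a, unsigned char b) {
+  return a % b;   /* an 8-bit divb leaves AL = quotient, AH = remainder */
+}
+/* Reading AH directly can clash with the REX restriction above, so isel
+   copies AX, shifts right by 8, and truncates to extract the remainder. */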
+
+//===---------------------------------------------------------------------===//
+
+The x86-64 ABI for hidden-argument struct returns requires that the
+incoming value of %rdi be copied into %rax by the callee upon return.
+
+The idea is that it saves callers from having to remember this value,
+which would often require a callee-saved register. Callees usually
+need to keep this value live for most of their body anyway, so it
+doesn't add a significant burden on them.
+
+We currently implement this in codegen; however, this is suboptimal
+because it makes it quite awkward to implement the corresponding
+optimization for callers.
+
+A better implementation would be to relax the LLVM IR rules for sret
+arguments to allow a function with an sret argument to have a non-void
+return type, and to have the front-end set up the sret argument value
+as the return value of the function. The front-end could then more easily
+emit uses of the returned struct value in terms of the function's
+lowered return value, and it would free non-C frontends from a
+complication required only by a C-based ABI.
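+
+For reference, the kind of C code involved (assumed example):
+
+struct Big { double a, b, c, d; };   /* too large for registers: MEMORY class */
+struct Big make_big(void);           /* returned via a hidden sret pointer */
+void use(struct Big *out) {
+  *out = make_big();  /* per the ABI, make_big must also return the incoming
+                         hidden pointer (%rdi) in %rax */
+}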
+
+//===---------------------------------------------------------------------===//
+
+We get a redundant zero extension for code like this:
+
+int mask[1000];
+int foo(unsigned x) {
+ if (x < 10)
+ x = x * 45;
+ else
+ x = x * 78;
+ return mask[x];
+}
+
+_foo:
+LBB1_0: ## entry
+ cmpl $9, %edi
+ jbe LBB1_3 ## bb
+LBB1_1: ## bb1
+ imull $78, %edi, %eax
+LBB1_2: ## bb2
+ movl %eax, %eax <----
+ movq _mask@GOTPCREL(%rip), %rcx
+ movl (%rcx,%rax,4), %eax
+ ret
+LBB1_3: ## bb
+ imull $45, %edi, %eax
+ jmp LBB1_2 ## bb2
+
+Before regalloc, we have:
+
+ %reg1025<def> = IMUL32rri8 %reg1024, 45, %EFLAGS<imp-def>
+ JMP mbb<bb2,0x203afb0>
+ Successors according to CFG: 0x203afb0 (#3)
+
+bb1: 0x203af60, LLVM BB @0x1e02310, ID#2:
+ Predecessors according to CFG: 0x203aec0 (#0)
+ %reg1026<def> = IMUL32rri8 %reg1024, 78, %EFLAGS<imp-def>
+ Successors according to CFG: 0x203afb0 (#3)
+
+bb2: 0x203afb0, LLVM BB @0x1e02340, ID#3:
+ Predecessors according to CFG: 0x203af10 (#1) 0x203af60 (#2)
+ %reg1027<def> = PHI %reg1025, mbb<bb,0x203af10>,
+ %reg1026, mbb<bb1,0x203af60>
+ %reg1029<def> = MOVZX64rr32 %reg1027
+
+so we'd have to know that IMUL32rri8 leaves the upper 32 bits of the 64-bit
+register zeroed, and be able to recognize the zero extend. This could also
+presumably be implemented if we had whole-function selection DAGs.
+
+//===---------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/X86/README.txt b/contrib/llvm/lib/Target/X86/README.txt
new file mode 100644
index 0000000..d4545a6
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/README.txt
@@ -0,0 +1,1931 @@
+//===---------------------------------------------------------------------===//
+// Random ideas for the X86 backend.
+//===---------------------------------------------------------------------===//
+
+We should add support for the "movbe" instruction, which does a byte-swapping
+copy (3-addr bswap + memory support?). It is available on Atom processors.
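+
+A pattern that movbe could cover (sketch; __builtin_bswap32 is the usual
+GCC/Clang builtin for the byte swap):
+
+unsigned load_be32(const unsigned *p) {
+  return __builtin_bswap32(*p);   /* a byte-swapping load; could become a
+                                     single movbe where it is supported */
+}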
+
+//===---------------------------------------------------------------------===//
+
+CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move. The X86
+backend knows how to three-addressify this shift, but it appears the register
+allocator isn't even asking it to do so in this case. We should investigate
+why this isn't happening, it could have significant impact on other important
+cases for X86 as well.
+
+//===---------------------------------------------------------------------===//
+
+This should be one DIV/IDIV instruction, not a libcall:
+
+unsigned test(unsigned long long X, unsigned Y) {
+ return X/Y;
+}
+
+This can be done trivially with a custom legalizer. What about overflow
+though? http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
+
+//===---------------------------------------------------------------------===//
+
+Improvements to the multiply -> shift/add algorithm:
+http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
+
+//===---------------------------------------------------------------------===//
+
+Improve code like this (occurs fairly frequently, e.g. in LLVM):
+long long foo(int x) { return 1LL << x; }
+
+http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
+http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
+http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
+
+Another useful one would be ~0ULL >> X and ~0ULL << X.
+
+One better solution for 1LL << x is:
+ xorl %eax, %eax
+ xorl %edx, %edx
+ testb $32, %cl
+ sete %al
+ setne %dl
+ sall %cl, %eax
+ sall %cl, %edx
+
+But that requires good 8-bit subreg support.
+
+Also, this might be better. It's an extra shift, but it's one instruction
+shorter, and doesn't stress 8-bit subreg support.
+(From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html,
+but without the unnecessary and.)
+ movl %ecx, %eax
+ shrl $5, %eax
+ movl %eax, %edx
+ xorl $1, %edx
+ sall %cl, %eax
+ sall %cl, %edx
+
+64-bit shifts (in general) expand to really bad code. Instead of using
+cmovs, we should expand to a conditional branch like GCC produces.
+
+//===---------------------------------------------------------------------===//
+
+Compile this:
+_Bool f(_Bool a) { return a!=1; }
+
+into:
+ movzbl %dil, %eax
+ xorl $1, %eax
+ ret
+
+(Although note that this isn't a legal way to express the code that llvm-gcc
+currently generates for that function.)
+
+//===---------------------------------------------------------------------===//
+
+Some isel ideas:
+
+1. Dynamic programming based approach when compile time is not an
+ issue.
+2. Code duplication (addressing mode) during isel.
+3. Other ideas from "Register-Sensitive Selection, Duplication, and
+ Sequencing of Instructions".
+4. Scheduling for reduced register pressure. E.g. "Minimum Register
+ Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
+ and other related papers.
+ http://citeseer.ist.psu.edu/govindarajan01minimum.html
+
+//===---------------------------------------------------------------------===//
+
+Should we promote i16 to i32 to avoid partial register update stalls?
+
+//===---------------------------------------------------------------------===//
+
+Leave any_extend as pseudo instruction and hint to register
+allocator. Delay codegen until post register allocation.
+Note. any_extend is now turned into an INSERT_SUBREG. We still need to teach
+the coalescer how to deal with it though.
+
+//===---------------------------------------------------------------------===//
+
+It appears icc uses push for parameter passing. Need to investigate.
+
+//===---------------------------------------------------------------------===//
+
+Only use inc/neg/not instructions on processors where they are faster than
+add/sub/xor. They are slower on the P4 due to only updating some processor
+flags.
+
+//===---------------------------------------------------------------------===//
+
+The instruction selector sometimes misses folding a load into a compare. The
+pattern is written as (cmp reg, (load p)). Because the compare isn't
+commutative, it is not matched with the load on both sides. The dag combiner
+should be made smart enough to canonicalize the load into the RHS of a compare
+when it can invert the result of the compare for free.
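+
+A hypothetical sketch of the two operand orders (which one folds depends on
+the pattern actually written, as described above):
+
+/* load is the second operand of the compare: matches (cmp reg, (load p)) */
+int f1(int *p, int x) { return x < *p; }
+
+/* load is the first operand: swapping the operands and inverting the
+   condition (< becomes >) would let it match the pattern above */
+int f2(int *p, int x) { return *p < x; }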
+
+//===---------------------------------------------------------------------===//
+
+In many cases, LLVM generates code like this:
+
+_test:
+ movl 8(%esp), %eax
+ cmpl %eax, 4(%esp)
+ setl %al
+ movzbl %al, %eax
+ ret
+
+On some processors (which ones?), it is more efficient to do this:
+
+_test:
+ movl 8(%esp), %ebx
+ xor %eax, %eax
+ cmpl %ebx, 4(%esp)
+ setl %al
+ ret
+
+Doing this correctly is tricky though, as the xor clobbers the flags.
+
+//===---------------------------------------------------------------------===//
+
+We should generate bts/btr/etc instructions on targets where they are cheap or
+when codesize is important. e.g., for:
+
+void setbit(int *target, int bit) {
+ *target |= (1 << bit);
+}
+void clearbit(int *target, int bit) {
+ *target &= ~(1 << bit);
+}
+
+//===---------------------------------------------------------------------===//
+
+Instead of the following for memset char*, 1, 10:
+
+ movl $16843009, 4(%edx)
+ movl $16843009, (%edx)
+ movw $257, 8(%edx)
+
+It might be better to generate
+
+ movl $16843009, %eax
+ movl %eax, 4(%edx)
+ movl %eax, (%edx)
+ movw %ax, 8(%edx)
+
+when we can spare a register. It reduces code size.
+
+//===---------------------------------------------------------------------===//
+
+Evaluate what the best way to codegen sdiv X, (2^C) is. For X/8, we currently
+get this:
+
+define i32 @test1(i32 %X) {
+ %Y = sdiv i32 %X, 8
+ ret i32 %Y
+}
+
+_test1:
+ movl 4(%esp), %eax
+ movl %eax, %ecx
+ sarl $31, %ecx
+ shrl $29, %ecx
+ addl %ecx, %eax
+ sarl $3, %eax
+ ret
+
+GCC knows several different ways to codegen it, one of which is this:
+
+_test1:
+ movl 4(%esp), %eax
+ cmpl $-1, %eax
+ leal 7(%eax), %ecx
+ cmovle %ecx, %eax
+ sarl $3, %eax
+ ret
+
+which is probably slower, but it's interesting at least :)
+
+//===---------------------------------------------------------------------===//
+
+We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and
+rep/movsl. We should leave these as libcalls for everything over a much lower
+threshold, since libc is hand-tuned for medium and large mem ops (avoiding RFO
+for large stores, TLB preheating, etc.).
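+
+A sketch of the size range in question (the 1MB figure is from the note above;
+the exact threshold is an open question):
+
+#include <string.h>
+void big_copy(char *dst, const char *src) {
+  memcpy(dst, src, 1 << 20);   /* 1MB: better left as a call to libc memcpy
+                                  than expanded inline to rep/movsl */
+}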
+
+//===---------------------------------------------------------------------===//
+
+Optimize this into something reasonable:
+ x * copysign(1.0, y) * copysign(1.0, z)
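+
+In C terms (assumed framing of the expression above):
+
+#include <math.h>
+double signprod(double x, double y, double z) {
+  return x * copysign(1.0, y) * copysign(1.0, z);
+  /* conceptually: flip the sign bit of x by signbit(y) ^ signbit(z) */
+}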
+
+//===---------------------------------------------------------------------===//
+
+Optimize copysign(x, *y) to use an integer load from y.
+
+//===---------------------------------------------------------------------===//
+
+The following tests perform worse with LSR:
+
+lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.
+
+//===---------------------------------------------------------------------===//
+
+Adding to the list of cmp / test poor codegen issues:
+
+int test(__m128 *A, __m128 *B) {
+ if (_mm_comige_ss(*A, *B))
+ return 3;
+ else
+ return 4;
+}
+
+_test:
+ movl 8(%esp), %eax
+ movaps (%eax), %xmm0
+ movl 4(%esp), %eax
+ movaps (%eax), %xmm1
+ comiss %xmm0, %xmm1
+ setae %al
+ movzbl %al, %ecx
+ movl $3, %eax
+ movl $4, %edx
+ cmpl $0, %ecx
+ cmove %edx, %eax
+ ret
+
+Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There
+are a number of issues. 1) We are introducing a setcc between the result of
+the intrinsic call and the select. 2) The intrinsic is expected to produce an
+i32 value, so an any_extend (which becomes a zero extend) is added.
+
+We probably need some kind of target DAG combine hook to fix this.
+
+//===---------------------------------------------------------------------===//
+
+We generate significantly worse code for this than GCC:
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
+http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
+
+There is also one case we do worse on PPC.
+
+//===---------------------------------------------------------------------===//
+
+For this:
+
+int test(int a)
+{
+ return a * 3;
+}
+
+We currently emit:
+ imull $3, 4(%esp), %eax
+
+Perhaps this is what we really should generate? Is imull three or four
+cycles? Note: ICC generates this:
+ movl 4(%esp), %eax
+ leal (%eax,%eax,2), %eax
+
+The current instruction priority is based on pattern complexity. The former is
+more "complex" because it folds a load so the latter will not be emitted.
+
+Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
+should always try to match LEA first since the LEA matching code does some
+estimate to determine whether the match is profitable.
+
+However, if we care more about code size, then imull is better. It's two bytes
+shorter than movl + leal.
+
+On a Pentium M, both variants have the same characteristics with regard
+to throughput; however, the multiplication has a latency of four cycles, as
+opposed to two cycles for the movl+lea variant.
+
+//===---------------------------------------------------------------------===//
+
+__builtin_ffs codegen is messy.
+
+int ffs_(unsigned X) { return __builtin_ffs(X); }
+
+llvm produces:
+ffs_:
+ movl 4(%esp), %ecx
+ bsfl %ecx, %eax
+ movl $32, %edx
+ cmove %edx, %eax
+ incl %eax
+ xorl %edx, %edx
+ testl %ecx, %ecx
+ cmove %edx, %eax
+ ret
+
+vs gcc:
+
+_ffs_:
+ movl $-1, %edx
+ bsfl 4(%esp), %eax
+ cmove %edx, %eax
+ addl $1, %eax
+ ret
+
+Another example of __builtin_ffs (use predsimplify to eliminate a select):
+
+int foo (unsigned long j) {
+ if (j)
+ return __builtin_ffs (j) - 1;
+ else
+ return 0;
+}
+
+//===---------------------------------------------------------------------===//
+
+It appears gcc places string data with linkonce linkage in
+.section __TEXT,__const_coal,coalesced instead of
+.section __DATA,__const_coal,coalesced.
+Take a look at darwin.h; there are other Darwin assembler directives that we
+do not make use of.
+
+//===---------------------------------------------------------------------===//
+
+define i32 @foo(i32* %a, i32 %t) {
+entry:
+ br label %cond_true
+
+cond_true: ; preds = %cond_true, %entry
+ %x.0.0 = phi i32 [ 0, %entry ], [ %tmp9, %cond_true ] ; <i32> [#uses=3]
+ %t_addr.0.0 = phi i32 [ %t, %entry ], [ %tmp7, %cond_true ] ; <i32> [#uses=1]
+ %tmp2 = getelementptr i32* %a, i32 %x.0.0 ; <i32*> [#uses=1]
+ %tmp3 = load i32* %tmp2 ; <i32> [#uses=1]
+ %tmp5 = add i32 %t_addr.0.0, %x.0.0 ; <i32> [#uses=1]
+ %tmp7 = add i32 %tmp5, %tmp3 ; <i32> [#uses=2]
+ %tmp9 = add i32 %x.0.0, 1 ; <i32> [#uses=2]
+ %tmp = icmp sgt i32 %tmp9, 39 ; <i1> [#uses=1]
+ br i1 %tmp, label %bb12, label %cond_true
+
+bb12: ; preds = %cond_true
+ ret i32 %tmp7
+}
+is pessimized by -loop-reduce and -indvars
+
+//===---------------------------------------------------------------------===//
+
+u32 to float conversion improvement:
+
+float uint32_2_float( unsigned u ) {
+ float fl = (int) (u & 0xffff);
+ float fh = (int) (u >> 16);
+ fh *= 0x1.0p16f;
+ return fh + fl;
+}
+
+00000000 subl $0x04,%esp
+00000003 movl 0x08(%esp,1),%eax
+00000007 movl %eax,%ecx
+00000009 shrl $0x10,%ecx
+0000000c cvtsi2ss %ecx,%xmm0
+00000010 andl $0x0000ffff,%eax
+00000015 cvtsi2ss %eax,%xmm1
+00000019 mulss 0x00000078,%xmm0
+00000021 addss %xmm1,%xmm0
+00000025 movss %xmm0,(%esp,1)
+0000002a flds (%esp,1)
+0000002d addl $0x04,%esp
+00000030 ret
+
+//===---------------------------------------------------------------------===//
+
+When using the fastcc ABI, align the stack slot of a double argument on an
+8-byte boundary to improve performance.
+
+//===---------------------------------------------------------------------===//
+
+Codegen:
+
+int f(int a, int b) {
+ if (a == 4 || a == 6)
+ b++;
+ return b;
+}
+
+
+as:
+
+or eax, 2
+cmp eax, 6
+jz label
+
+//===---------------------------------------------------------------------===//
+
+GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
+simplifications for integer "x cmp y ? a : b". For example, instead of:
+
+int G;
+void f(int X, int Y) {
+ G = X < 0 ? 14 : 13;
+}
+
+compiling to:
+
+_f:
+ movl $14, %eax
+ movl $13, %ecx
+ movl 4(%esp), %edx
+ testl %edx, %edx
+ cmovl %eax, %ecx
+ movl %ecx, _G
+ ret
+
+it could be:
+_f:
+ movl 4(%esp), %eax
+ sarl $31, %eax
+ notl %eax
+ addl $14, %eax
+ movl %eax, _G
+ ret
+
+etc.
+
+Another is:
+int usesbb(unsigned int a, unsigned int b) {
+ return (a < b ? -1 : 0);
+}
+to:
+_usesbb:
+ movl 8(%esp), %eax
+ cmpl %eax, 4(%esp)
+ sbbl %eax, %eax
+ ret
+
+instead of:
+_usesbb:
+ xorl %eax, %eax
+ movl 8(%esp), %ecx
+ cmpl %ecx, 4(%esp)
+ movl $4294967295, %ecx
+ cmovb %ecx, %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+Consider the expansion of:
+
+define i32 @test3(i32 %X) {
+ %tmp1 = urem i32 %X, 255
+ ret i32 %tmp1
+}
+
+Currently it compiles to:
+
+...
+ movl $2155905153, %ecx
+ movl 8(%esp), %esi
+ movl %esi, %eax
+ mull %ecx
+...
+
+This could be "reassociated" into:
+
+ movl $2155905153, %eax
+ movl 8(%esp), %ecx
+ mull %ecx
+
+to avoid the copy. In fact, the existing two-address stuff would do this
+except that mul isn't a commutative 2-addr instruction. I guess this has
+to be done at isel time based on the #uses of mul?
+
+//===---------------------------------------------------------------------===//
+
+Make sure the instruction which starts a loop does not cross a cacheline
+boundary. This requires knowing the exact length of each machine instruction.
+That is somewhat complicated, but doable. Example 256.bzip2:
+
+In the new trace, the hot loop has an instruction which crosses a cacheline
+boundary. In addition to potential cache misses, this can't help decoding as I
+imagine there has to be some kind of complicated decoder reset and realignment
+to grab the bytes from the next cacheline.
+
+532 532 0x3cfc movb (1809(%esp, %esi), %bl <<<--- spans 2 64 byte lines
+942 942 0x3d03 movl %dh, (1809(%esp, %esi)
+937 937 0x3d0a incl %esi
+3 3 0x3d0b cmpb %bl, %dl
+27 27 0x3d0d jnz 0x000062db <main+11707>
+
+//===---------------------------------------------------------------------===//
+
+In C99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.
+
+//===---------------------------------------------------------------------===//
+
+This could be a single 16-bit load.
+
+int f(char *p) {
+ if ((p[0] == 1) & (p[1] == 2)) return 1;
+ return 0;
+}
+
+//===---------------------------------------------------------------------===//
+
+We should inline lrintf and probably other libc functions.
+
+//===---------------------------------------------------------------------===//
+
+Use the FLAGS values from arithmetic instructions more. For example, compile:
+
+int add_zf(int *x, int y, int a, int b) {
+ if ((*x += y) == 0)
+ return a;
+ else
+ return b;
+}
+
+to:
+ addl %esi, (%rdi)
+ movl %edx, %eax
+ cmovne %ecx, %eax
+ ret
+instead of:
+
+_add_zf:
+ addl (%rdi), %esi
+ movl %esi, (%rdi)
+ testl %esi, %esi
+ cmove %edx, %ecx
+ movl %ecx, %eax
+ ret
+
+As another example, compile function f2 in test/CodeGen/X86/cmp-test.ll
+without a test instruction.
+
+//===---------------------------------------------------------------------===//
+
+These two functions have identical effects:
+
+unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
+unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
+
+We currently compile them to:
+
+_f:
+ movl 4(%esp), %eax
+ movl %eax, %ecx
+ incl %ecx
+ movl 8(%esp), %edx
+ cmpl %edx, %ecx
+ jne LBB1_2 #UnifiedReturnBlock
+LBB1_1: #cond_true
+ addl $2, %eax
+ ret
+LBB1_2: #UnifiedReturnBlock
+ movl %ecx, %eax
+ ret
+_f2:
+ movl 4(%esp), %eax
+ movl %eax, %ecx
+ incl %ecx
+ cmpl 8(%esp), %ecx
+ sete %cl
+ movzbl %cl, %ecx
+ leal 1(%ecx,%eax), %eax
+ ret
+
+both of which are inferior to GCC's:
+
+_f:
+ movl 4(%esp), %edx
+ leal 1(%edx), %eax
+ addl $2, %edx
+ cmpl 8(%esp), %eax
+ cmove %edx, %eax
+ ret
+_f2:
+ movl 4(%esp), %eax
+ addl $1, %eax
+ xorl %edx, %edx
+ cmpl 8(%esp), %eax
+ sete %dl
+ addl %edx, %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+void test(int X) {
+ if (X) abort();
+}
+
+is currently compiled to:
+
+_test:
+ subl $12, %esp
+ cmpl $0, 16(%esp)
+ jne LBB1_1
+ addl $12, %esp
+ ret
+LBB1_1:
+ call L_abort$stub
+
+It would be better to produce:
+
+_test:
+ subl $12, %esp
+ cmpl $0, 16(%esp)
+ jne L_abort$stub
+ addl $12, %esp
+ ret
+
+This can be applied to any no-return function call that takes no arguments etc.
+Alternatively, the stack save/restore logic could be shrink-wrapped, producing
+something like this:
+
+_test:
+ cmpl $0, 4(%esp)
+ jne LBB1_1
+ ret
+LBB1_1:
+ subl $12, %esp
+ call L_abort$stub
+
+Both are useful in different situations. Finally, it could be shrink-wrapped
+and tail called, like this:
+
+_test:
+ cmpl $0, 4(%esp)
+ jne LBB1_1
+ ret
+LBB1_1:
+ pop %eax # realign stack.
+ call L_abort$stub
+
+Though this probably isn't worth it.
+
+//===---------------------------------------------------------------------===//
+
+Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
+a neg instead of a sub instruction. Consider:
+
+int test(char X) { return 7-X; }
+
+we currently produce:
+_test:
+ movl $7, %eax
+ movsbl 4(%esp), %ecx
+ subl %ecx, %eax
+ ret
+
+We would use one fewer register if codegen'd as:
+
+ movsbl 4(%esp), %eax
+ neg %eax
+ add $7, %eax
+ ret
+
+Note that this isn't beneficial if the load can be folded into the sub. In
+this case, we want a sub:
+
+int test(int X) { return 7-X; }
+_test:
+ movl $7, %eax
+ subl 4(%esp), %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+Leaf functions that require one 4-byte spill slot have a prolog like this:
+
+_foo:
+ pushl %esi
+ subl $4, %esp
+...
+and an epilog like this:
+ addl $4, %esp
+ popl %esi
+ ret
+
+It would be smaller, and potentially faster, to push eax on entry and to
+pop into a dummy register instead of using addl/subl of esp. Just don't pop
+into any return registers :)
+
+//===---------------------------------------------------------------------===//
+
+The X86 backend should fold (branch (or (setcc, setcc))) into multiple
+branches. We generate really poor code for:
+
+double testf(double a) {
+ return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
+}
+
+For example, the entry BB is:
+
+_testf:
+ subl $20, %esp
+ pxor %xmm0, %xmm0
+ movsd 24(%esp), %xmm1
+ ucomisd %xmm0, %xmm1
+ setnp %al
+ sete %cl
+ testb %cl, %al
+ jne LBB1_5 # UnifiedReturnBlock
+LBB1_1: # cond_true
+
+
+it would be better to replace the last four instructions with:
+
+ jp LBB1_1
+ je LBB1_5
+LBB1_1:
+
+We also codegen the inner ?: into a diamond:
+
+ cvtss2sd LCPI1_0(%rip), %xmm2
+ cvtss2sd LCPI1_1(%rip), %xmm3
+ ucomisd %xmm1, %xmm0
+ ja LBB1_3 # cond_true
+LBB1_2: # cond_true
+ movapd %xmm3, %xmm2
+LBB1_3: # cond_true
+ movapd %xmm2, %xmm0
+ ret
+
+We should sink the load into xmm3 into the LBB1_2 block. This should
+be pretty easy, and will nuke all the copies.
+
+//===---------------------------------------------------------------------===//
+
+This:
+ #include <algorithm>
+ inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
+ { return std::make_pair(a + b, a + b < a); }
+ bool no_overflow(unsigned a, unsigned b)
+ { return !full_add(a, b).second; }
+
+Should compile to:
+
+
+ _Z11no_overflowjj:
+ addl %edi, %esi
+ setae %al
+ ret
+
+FIXME: That code looks wrong; bool return is normally defined as zext.
+
+on x86-64, not:
+
+__Z11no_overflowjj:
+ addl %edi, %esi
+ cmpl %edi, %esi
+ setae %al
+ movzbl %al, %eax
+ ret
+
+
+//===---------------------------------------------------------------------===//
+
+The following code:
+
+bb114.preheader: ; preds = %cond_next94
+ %tmp231232 = sext i16 %tmp62 to i32 ; <i32> [#uses=1]
+ %tmp233 = sub i32 32, %tmp231232 ; <i32> [#uses=1]
+ %tmp245246 = sext i16 %tmp65 to i32 ; <i32> [#uses=1]
+ %tmp252253 = sext i16 %tmp68 to i32 ; <i32> [#uses=1]
+ %tmp254 = sub i32 32, %tmp252253 ; <i32> [#uses=1]
+ %tmp553554 = bitcast i16* %tmp37 to i8* ; <i8*> [#uses=2]
+ %tmp583584 = sext i16 %tmp98 to i32 ; <i32> [#uses=1]
+ %tmp585 = sub i32 32, %tmp583584 ; <i32> [#uses=1]
+ %tmp614615 = sext i16 %tmp101 to i32 ; <i32> [#uses=1]
+ %tmp621622 = sext i16 %tmp104 to i32 ; <i32> [#uses=1]
+ %tmp623 = sub i32 32, %tmp621622 ; <i32> [#uses=1]
+ br label %bb114
+
+produces:
+
+LBB3_5: # bb114.preheader
+ movswl -68(%ebp), %eax
+ movl $32, %ecx
+ movl %ecx, -80(%ebp)
+ subl %eax, -80(%ebp)
+ movswl -52(%ebp), %eax
+ movl %ecx, -84(%ebp)
+ subl %eax, -84(%ebp)
+ movswl -70(%ebp), %eax
+ movl %ecx, -88(%ebp)
+ subl %eax, -88(%ebp)
+ movswl -50(%ebp), %eax
+ subl %eax, %ecx
+ movl %ecx, -76(%ebp)
+ movswl -42(%ebp), %eax
+ movl %eax, -92(%ebp)
+ movswl -66(%ebp), %eax
+ movl %eax, -96(%ebp)
+ movw $0, -98(%ebp)
+
+This appears to be bad because the RA is not folding the store to the stack
+slot into the movl. The above instructions could be:
+ movl $32, -80(%ebp)
+...
+ movl $32, -84(%ebp)
+...
+This seems like a cross between remat and spill folding.
+
+This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't
+change, so we could simply subtract %eax from %ecx first and then use %ecx (or
+vice-versa).
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+ %tmp659 = icmp slt i16 %tmp654, 0 ; <i1> [#uses=1]
+ br i1 %tmp659, label %cond_true662, label %cond_next715
+
+produces this:
+
+ testw %cx, %cx
+ movswl %cx, %esi
+ jns LBB4_109 # cond_next715
+
+Shark tells us that using %cx in the testw instruction is sub-optimal. It
+suggests using the 32-bit register (which is what ICC uses).
+
+//===---------------------------------------------------------------------===//
+
+We compile this:
+
+void compare (long long foo) {
+ if (foo < 4294967297LL)
+ abort();
+}
+
+to:
+
+compare:
+ subl $4, %esp
+ cmpl $0, 8(%esp)
+ setne %al
+ movzbw %al, %ax
+ cmpl $1, 12(%esp)
+ setg %cl
+ movzbw %cl, %cx
+ cmove %ax, %cx
+ testb $1, %cl
+ jne .LBB1_2 # UnifiedReturnBlock
+.LBB1_1: # ifthen
+ call abort
+.LBB1_2: # UnifiedReturnBlock
+ addl $4, %esp
+ ret
+
+(also really horrible code on ppc). This is due to the expand code for 64-bit
+compares. GCC produces multiple branches, which is much nicer:
+
+compare:
+ subl $12, %esp
+ movl 20(%esp), %edx
+ movl 16(%esp), %eax
+ decl %edx
+ jle .L7
+.L5:
+ addl $12, %esp
+ ret
+ .p2align 4,,7
+.L7:
+ jl .L4
+ cmpl $0, %eax
+ .p2align 4,,8
+ ja .L5
+.L4:
+ .p2align 4,,9
+ call abort
+
+//===---------------------------------------------------------------------===//
+
+Tail call optimization improvements: Tail call optimization currently
+pushes all arguments on the top of the stack (their normal place for
+non-tail-call-optimized calls) that source from the caller's arguments
+or from a virtual register (also possibly sourcing from the caller's
+arguments).
+This is done to prevent overwriting parameters (see the example
+below) that might be used later.
+
+example:
+
+int callee(int32, int64);
+int caller(int32 arg1, int32 arg2) {
+ int64 local = arg2 * 2;
+ return callee(arg2, (int64)local);
+}
+
+[arg1] [!arg2 no longer valid since we moved local onto it]
+[arg2] -> [(int64)
+[RETADDR] local ]
+
+Moving arg1 onto the stack slot of the callee function would overwrite
+arg2 of the caller.
+
+Possible optimizations:
+
+
+ - Analyse the actual parameters of the callee to see which would
+ overwrite a caller parameter which is used by the callee and only
+ push them onto the top of the stack.
+
+ int callee (int32 arg1, int32 arg2);
+ int caller (int32 arg1, int32 arg2) {
+ return callee(arg1,arg2);
+ }
+
+ Here we don't need to write any variables to the top of the stack
+ since they don't overwrite each other.
+
+ int callee (int32 arg1, int32 arg2);
+ int caller (int32 arg1, int32 arg2) {
+ return callee(arg2,arg1);
+ }
+
+ Here we need to push the arguments because they overwrite each
+ other.
+
+//===---------------------------------------------------------------------===//
+
+main ()
+{
+ int i = 0;
+ unsigned long int z = 0;
+
+ do {
+ z -= 0x00004000;
+ i++;
+ if (i > 0x00040000)
+ abort ();
+ } while (z > 0);
+ exit (0);
+}
+
+gcc compiles this to:
+
+_main:
+ subl $28, %esp
+ xorl %eax, %eax
+ jmp L2
+L3:
+ cmpl $262144, %eax
+ je L10
+L2:
+ addl $1, %eax
+ cmpl $262145, %eax
+ jne L3
+ call L_abort$stub
+L10:
+ movl $0, (%esp)
+ call L_exit$stub
+
+llvm:
+
+_main:
+ subl $12, %esp
+ movl $1, %eax
+ movl $16384, %ecx
+LBB1_1: # bb
+ cmpl $262145, %eax
+ jge LBB1_4 # cond_true
+LBB1_2: # cond_next
+ incl %eax
+ addl $4294950912, %ecx
+ cmpl $16384, %ecx
+ jne LBB1_1 # bb
+LBB1_3: # bb11
+ xorl %eax, %eax
+ addl $12, %esp
+ ret
+LBB1_4: # cond_true
+ call L_abort$stub
+
+1. LSR should rewrite the first cmp with induction variable %ecx.
+2. DAG combiner should fold
+ leal 1(%eax), %edx
+ cmpl $262145, %edx
+ =>
+ cmpl $262144, %eax
+
+//===---------------------------------------------------------------------===//
+
+define i64 @test(double %X) {
+ %Y = fptosi double %X to i64
+ ret i64 %Y
+}
+
+compiles to:
+
+_test:
+ subl $20, %esp
+ movsd 24(%esp), %xmm0
+ movsd %xmm0, 8(%esp)
+ fldl 8(%esp)
+ fisttpll (%esp)
+ movl 4(%esp), %edx
+ movl (%esp), %eax
+ addl $20, %esp
+ #FP_REG_KILL
+ ret
+
+This should just fldl directly from the input stack slot.
+
+//===---------------------------------------------------------------------===//
+
+This code:
+int foo (int x) { return (x & 65535) | 255; }
+
+Should compile into:
+
+_foo:
+ movzwl 4(%esp), %eax
+ orl $255, %eax
+ ret
+
+instead of:
+_foo:
+ movl $255, %eax
+ orl 4(%esp), %eax
+ andl $65535, %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+We're codegen'ing multiply of long longs inefficiently:
+
+unsigned long long LLM(unsigned long long arg1, unsigned long long arg2) {
+ return arg1 * arg2;
+}
+
+We compile to this (with -fomit-frame-pointer):
+
+_LLM:
+ pushl %esi
+ movl 8(%esp), %ecx
+ movl 16(%esp), %esi
+ movl %esi, %eax
+ mull %ecx
+ imull 12(%esp), %esi
+ addl %edx, %esi
+ imull 20(%esp), %ecx
+ movl %esi, %edx
+ addl %ecx, %edx
+ popl %esi
+ ret
+
+This looks like a scheduling deficiency and lack of remat of the load from
+the argument area. ICC apparently produces:
+
+ movl 8(%esp), %ecx
+ imull 12(%esp), %ecx
+ movl 16(%esp), %eax
+ imull 4(%esp), %eax
+ addl %eax, %ecx
+ movl 4(%esp), %eax
+ mull 12(%esp)
+ addl %ecx, %edx
+ ret
+
+Note that it remat'd loads from 4(esp) and 12(esp). See this GCC PR:
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17236
+
+//===---------------------------------------------------------------------===//
+
+We can fold a store into "zeroing a reg". Instead of:
+
+xorl %eax, %eax
+movl %eax, 124(%esp)
+
+we should get:
+
+movl $0, 124(%esp)
+
+if the flags of the xor are dead.
+
+Likewise, we isel "x<<1" into "add reg,reg". If reg is spilled, this should
+be folded into: shl [mem], 1
+
+//===---------------------------------------------------------------------===//
+
+This testcase misses a read/modify/write opportunity (from PR1425):
+
+void vertical_decompose97iH1(int *b0, int *b1, int *b2, int width){
+ int i;
+ for(i=0; i<width; i++)
+ b1[i] += (1*(b0[i] + b2[i])+0)>>0;
+}
+
+We compile it down to:
+
+LBB1_2: # bb
+ movl (%esi,%edi,4), %ebx
+ addl (%ecx,%edi,4), %ebx
+ addl (%edx,%edi,4), %ebx
+ movl %ebx, (%ecx,%edi,4)
+ incl %edi
+ cmpl %eax, %edi
+ jne LBB1_2 # bb
+
+the inner loop should add to the memory location (%ecx,%edi,4), saving
+a mov. Something like:
+
+ movl (%esi,%edi,4), %ebx
+ addl (%edx,%edi,4), %ebx
+ addl %ebx, (%ecx,%edi,4)
+
+Here is another interesting example:
+
+void vertical_compose97iH1(int *b0, int *b1, int *b2, int width){
+ int i;
+ for(i=0; i<width; i++)
+ b1[i] -= (1*(b0[i] + b2[i])+0)>>0;
+}
+
+We miss the r/m/w opportunity here by using 2 subs instead of an add+sub[mem]:
+
+LBB9_2: # bb
+ movl (%ecx,%edi,4), %ebx
+ subl (%esi,%edi,4), %ebx
+ subl (%edx,%edi,4), %ebx
+ movl %ebx, (%ecx,%edi,4)
+ incl %edi
+ cmpl %eax, %edi
+ jne LBB9_2 # bb
+
+Additionally, LSR should rewrite the exit condition of these loops to use
+a stride-4 IV, which would allow all the scales in the loop to go away.
+This would result in smaller code and more efficient microops.
+
+//===---------------------------------------------------------------------===//
+
+In SSE mode, we turn abs and neg into a load from the constant pool plus a xor
+or and instruction, for example:
+
+ xorpd LCPI1_0, %xmm2
+
+However, if xmm2 gets spilled, we end up with really ugly code like this:
+
+ movsd (%esp), %xmm0
+ xorpd LCPI1_0, %xmm0
+ movsd %xmm0, (%esp)
+
+Since we 'know' that this is a 'neg', we can actually "fold" the spill into
+the neg/abs instruction, turning it into an *integer* operation, like this:
+
+ xorl 2147483648, [mem+4] ## 2147483648 = (1 << 31)
+
+you could also use xorb, but xorl is less likely to lead to a partial register
+stall. Here is a contrived testcase:
+
+double a, b, c;
+void test(double *P) {
+ double X = *P;
+ a = X;
+ bar();
+ X = -X;
+ b = X;
+ bar();
+ c = X;
+}
+
+//===---------------------------------------------------------------------===//
+
+Handling llvm.memory.barrier on pre-SSE2 CPUs should generate:
+
+lock ; mov %esp, %esp
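+
+A source construct that needs such a barrier (sketch; __sync_synchronize is
+the GCC/Clang builtin that lowers to a full memory barrier):
+
+void publish(int *p, int v) {
+  *p = v;
+  __sync_synchronize();  /* pre-SSE2 CPUs have no mfence, so a locked no-op
+                            like the one above serves as the fence */
+}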
+
+//===---------------------------------------------------------------------===//
+
+The code generated on x86 for checking for signed overflow on a multiply, done
+the obvious way, is much longer than it needs to be.
+
+int x(int a, int b) {
+ long long prod = (long long)a*b;
+ return prod > 0x7FFFFFFF || prod < (-0x7FFFFFFF-1);
+}
+
+See PR2053 for more details.
+
+//===---------------------------------------------------------------------===//
+
+We should investigate using cdq/cltd (effect: edx = sar eax, 31)
+more aggressively; it should cost the same as a move+shift on any modern
+processor, but it's a lot shorter. Downside is that it puts more
+pressure on register allocation because it has fixed operands.
+
+Example:
+int abs(int x) {return x < 0 ? -x : x;}
+
+gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.:
+abs:
+ movl 4(%esp), %eax
+ cltd
+ xorl %edx, %eax
+ subl %edx, %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+Consider:
+int test(unsigned long a, unsigned long b) { return -(a < b); }
+
+We currently compile this to:
+
+define i32 @test(i32 %a, i32 %b) nounwind {
+ %tmp3 = icmp ult i32 %a, %b ; <i1> [#uses=1]
+ %tmp34 = zext i1 %tmp3 to i32 ; <i32> [#uses=1]
+ %tmp5 = sub i32 0, %tmp34 ; <i32> [#uses=1]
+ ret i32 %tmp5
+}
+
+and
+
+_test:
+ movl 8(%esp), %eax
+ cmpl %eax, 4(%esp)
+ setb %al
+ movzbl %al, %eax
+ negl %eax
+ ret
+
+Several deficiencies here. First, we should instcombine zext+neg into sext:
+
+define i32 @test2(i32 %a, i32 %b) nounwind {
+ %tmp3 = icmp ult i32 %a, %b ; <i1> [#uses=1]
+ %tmp34 = sext i1 %tmp3 to i32 ; <i32> [#uses=1]
+ ret i32 %tmp34
+}
+
+However, before we can do that, we have to fix the bad codegen that we get for
+sext from bool:
+
+_test2:
+ movl 8(%esp), %eax
+ cmpl %eax, 4(%esp)
+ setb %al
+ movzbl %al, %eax
+ shll $31, %eax
+ sarl $31, %eax
+ ret
+
+This code should be at least as good as the code above. Once this is fixed, we
+can optimize this specific case even more to:
+
+ movl 8(%esp), %eax
+ xorl %ecx, %ecx
+ cmpl %eax, 4(%esp)
+ sbbl %ecx, %ecx
+
+//===---------------------------------------------------------------------===//
+
+Take the following code (from
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16541):
+
+extern unsigned char first_one[65536];
+int FirstOnet(unsigned long long arg1)
+{
+ if (arg1 >> 48)
+ return (first_one[arg1 >> 48]);
+ return 0;
+}
+
+
+The following code is currently generated:
+FirstOnet:
+ movl 8(%esp), %eax
+ cmpl $65536, %eax
+ movl 4(%esp), %ecx
+ jb .LBB1_2 # UnifiedReturnBlock
+.LBB1_1: # ifthen
+ shrl $16, %eax
+ movzbl first_one(%eax), %eax
+ ret
+.LBB1_2: # UnifiedReturnBlock
+ xorl %eax, %eax
+ ret
+
+There are a few possible improvements here:
+1. We should be able to eliminate the dead load into %ecx
+2. We could change the "movl 8(%esp), %eax" into
+ "movzwl 10(%esp), %eax"; this lets us change the cmpl
+ into a testl, which is shorter, and eliminate the shift.
+
+We could also in theory eliminate the branch by using a conditional
+for the address of the load, but that seems unlikely to be worthwhile
+in general.
+
+//===---------------------------------------------------------------------===//
+
+We compile this function:
+
+define i32 @foo(i32 %a, i32 %b, i32 %c, i8 zeroext %d) nounwind {
+entry:
+ %tmp2 = icmp eq i8 %d, 0 ; <i1> [#uses=1]
+ br i1 %tmp2, label %bb7, label %bb
+
+bb: ; preds = %entry
+ %tmp6 = add i32 %b, %a ; <i32> [#uses=1]
+ ret i32 %tmp6
+
+bb7: ; preds = %entry
+ %tmp10 = sub i32 %a, %c ; <i32> [#uses=1]
+ ret i32 %tmp10
+}
+
+to:
+
+_foo:
+ cmpb $0, 16(%esp)
+ movl 12(%esp), %ecx
+ movl 8(%esp), %eax
+ movl 4(%esp), %edx
+ je LBB1_2 # bb7
+LBB1_1: # bb
+ addl %edx, %eax
+ ret
+LBB1_2: # bb7
+ movl %edx, %eax
+ subl %ecx, %eax
+ ret
+
+The coalescer could coalesce "edx" with "eax" to avoid the movl in LBB1_2
+if it commuted the addl in LBB1_1.
+
+//===---------------------------------------------------------------------===//
+
+See rdar://4653682.
+
+From flops:
+
+LBB1_15: # bb310
+ cvtss2sd LCPI1_0, %xmm1
+ addsd %xmm1, %xmm0
+ movsd 176(%esp), %xmm2
+ mulsd %xmm0, %xmm2
+ movapd %xmm2, %xmm3
+ mulsd %xmm3, %xmm3
+ movapd %xmm3, %xmm4
+ mulsd LCPI1_23, %xmm4
+ addsd LCPI1_24, %xmm4
+ mulsd %xmm3, %xmm4
+ addsd LCPI1_25, %xmm4
+ mulsd %xmm3, %xmm4
+ addsd LCPI1_26, %xmm4
+ mulsd %xmm3, %xmm4
+ addsd LCPI1_27, %xmm4
+ mulsd %xmm3, %xmm4
+ addsd LCPI1_28, %xmm4
+ mulsd %xmm3, %xmm4
+ addsd %xmm1, %xmm4
+ mulsd %xmm2, %xmm4
+ movsd 152(%esp), %xmm1
+ addsd %xmm4, %xmm1
+ movsd %xmm1, 152(%esp)
+ incl %eax
+ cmpl %eax, %esi
+ jge LBB1_15 # bb310
+LBB1_16: # bb358.loopexit
+ movsd 152(%esp), %xmm0
+ addsd %xmm0, %xmm0
+ addsd LCPI1_22, %xmm0
+ movsd %xmm0, 152(%esp)
+
+Rather than spilling the result of the last addsd in the loop, we should have
+inserted a copy to split the interval (one for the duration of the loop, one
+extending to the fall through). The register pressure in the loop isn't high
+enough to warrant the spill.
+
+Also check why xmm7 is not used at all in the function.
+
+//===---------------------------------------------------------------------===//
+
+Legalize loses track of the fact that bools are always zero extended when in
+memory. This causes us to compile abort_gzip (from 164.gzip) from:
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target triple = "i386-apple-darwin8"
+@in_exit.4870.b = internal global i1 false ; <i1*> [#uses=2]
+define fastcc void @abort_gzip() noreturn nounwind {
+entry:
+ %tmp.b.i = load i1* @in_exit.4870.b ; <i1> [#uses=1]
+ br i1 %tmp.b.i, label %bb.i, label %bb4.i
+bb.i: ; preds = %entry
+ tail call void @exit( i32 1 ) noreturn nounwind
+ unreachable
+bb4.i: ; preds = %entry
+ store i1 true, i1* @in_exit.4870.b
+ tail call void @exit( i32 1 ) noreturn nounwind
+ unreachable
+}
+declare void @exit(i32) noreturn nounwind
+
+into:
+
+_abort_gzip:
+ subl $12, %esp
+ movb _in_exit.4870.b, %al
+ notb %al
+ testb $1, %al
+ jne LBB1_2 ## bb4.i
+LBB1_1: ## bb.i
+ ...
+
+//===---------------------------------------------------------------------===//
+
+We compile:
+
+int test(int x, int y) {
+ return x-y-1;
+}
+
+into (-m64):
+
+_test:
+ decl %edi
+ movl %edi, %eax
+ subl %esi, %eax
+ ret
+
+it would be better to codegen as: x+~y (notl+addl)
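+
+The identity being suggested (simple two's-complement arithmetic):
+
+int test_rewritten(int x, int y) {
+  return x + ~y;   /* ~y == -y - 1, so x + ~y == x - y - 1 */
+}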
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+int foo(const char *str,...)
+{
+ __builtin_va_list a; int x;
+ __builtin_va_start(a,str); x = __builtin_va_arg(a,int); __builtin_va_end(a);
+ return x;
+}
+
+gets compiled into this on x86-64:
+ subq $200, %rsp
+ movaps %xmm7, 160(%rsp)
+ movaps %xmm6, 144(%rsp)
+ movaps %xmm5, 128(%rsp)
+ movaps %xmm4, 112(%rsp)
+ movaps %xmm3, 96(%rsp)
+ movaps %xmm2, 80(%rsp)
+ movaps %xmm1, 64(%rsp)
+ movaps %xmm0, 48(%rsp)
+ movq %r9, 40(%rsp)
+ movq %r8, 32(%rsp)
+ movq %rcx, 24(%rsp)
+ movq %rdx, 16(%rsp)
+ movq %rsi, 8(%rsp)
+ leaq (%rsp), %rax
+ movq %rax, 192(%rsp)
+ leaq 208(%rsp), %rax
+ movq %rax, 184(%rsp)
+ movl $48, 180(%rsp)
+ movl $8, 176(%rsp)
+ movl 176(%rsp), %eax
+ cmpl $47, %eax
+ jbe .LBB1_3 # bb
+.LBB1_1: # bb3
+ movq 184(%rsp), %rcx
+ leaq 8(%rcx), %rax
+ movq %rax, 184(%rsp)
+.LBB1_2: # bb4
+ movl (%rcx), %eax
+ addq $200, %rsp
+ ret
+.LBB1_3: # bb
+ movl %eax, %ecx
+ addl $8, %eax
+ addq 192(%rsp), %rcx
+ movl %eax, 176(%rsp)
+ jmp .LBB1_2 # bb4
+
+gcc 4.3 generates:
+ subq $96, %rsp
+.LCFI0:
+ leaq 104(%rsp), %rax
+ movq %rsi, -80(%rsp)
+ movl $8, -120(%rsp)
+ movq %rax, -112(%rsp)
+ leaq -88(%rsp), %rax
+ movq %rax, -104(%rsp)
+ movl $8, %eax
+ cmpl $48, %eax
+ jb .L6
+ movq -112(%rsp), %rdx
+ movl (%rdx), %eax
+ addq $96, %rsp
+ ret
+ .p2align 4,,10
+ .p2align 3
+.L6:
+ mov %eax, %edx
+ addq -104(%rsp), %rdx
+ addl $8, %eax
+ movl %eax, -120(%rsp)
+ movl (%rdx), %eax
+ addq $96, %rsp
+ ret
+
+and it gets compiled into this on x86:
+ pushl %ebp
+ movl %esp, %ebp
+ subl $4, %esp
+ leal 12(%ebp), %eax
+ movl %eax, -4(%ebp)
+ leal 16(%ebp), %eax
+ movl %eax, -4(%ebp)
+ movl 12(%ebp), %eax
+ addl $4, %esp
+ popl %ebp
+ ret
+
+gcc 4.3 generates:
+ pushl %ebp
+ movl %esp, %ebp
+ movl 12(%ebp), %eax
+ popl %ebp
+ ret
+
+//===---------------------------------------------------------------------===//
+
+Teach tblgen not to check bitconvert source type in some cases. This allows us
+to consolidate the following patterns in X86InstrMMX.td:
+
+def : Pat<(v2i32 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
+ (iPTR 0))))),
+ (v2i32 (MMX_MOVDQ2Qrr VR128:$src))>;
+def : Pat<(v4i16 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
+ (iPTR 0))))),
+ (v4i16 (MMX_MOVDQ2Qrr VR128:$src))>;
+def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
+ (iPTR 0))))),
+ (v8i8 (MMX_MOVDQ2Qrr VR128:$src))>;
+
+There are other cases in various td files.
+
+//===---------------------------------------------------------------------===//
+
+Take something like the following on x86-32:
+unsigned a(unsigned long long x, unsigned y) {return x % y;}
+
+We currently generate a libcall, but we really shouldn't: the expansion is
+shorter and likely faster than the libcall. The expected code is something
+like the following:
+
+ movl 12(%ebp), %eax
+ movl 16(%ebp), %ecx
+ xorl %edx, %edx
+ divl %ecx
+ movl 8(%ebp), %eax
+ divl %ecx
+ movl %edx, %eax
+ ret
+
+A similar code sequence works for division.
+
+//===---------------------------------------------------------------------===//
+
+These should compile to the same code, but the latter codegens to useless
+instructions on X86. This may be a trivial dag combine (GCC PR7061):
+
+struct s1 { unsigned char a, b; };
+unsigned long f1(struct s1 x) {
+ return x.a + x.b;
+}
+struct s2 { unsigned a: 8, b: 8; };
+unsigned long f2(struct s2 x) {
+ return x.a + x.b;
+}
+
+//===---------------------------------------------------------------------===//
+
+We currently compile this:
+
+define i32 @func1(i32 %v1, i32 %v2) nounwind {
+entry:
+ %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
+ %sum = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ br i1 %obit, label %overflow, label %normal
+normal:
+ ret i32 %sum
+overflow:
+ call void @llvm.trap()
+ unreachable
+}
+declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
+declare void @llvm.trap()
+
+to:
+
+_func1:
+ movl 4(%esp), %eax
+ addl 8(%esp), %eax
+ jo LBB1_2 ## overflow
+LBB1_1: ## normal
+ ret
+LBB1_2: ## overflow
+ ud2
+
+it would be nice to produce "into" someday.
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+void vec_mpys1(int y[], const int x[], int scaler) {
+int i;
+for (i = 0; i < 150; i++)
+ y[i] += (((long long)scaler * (long long)x[i]) >> 31);
+}
+
+Compiles to this loop with GCC 3.x:
+
+.L5:
+ movl %ebx, %eax
+ imull (%edi,%ecx,4)
+ shrdl $31, %edx, %eax
+ addl %eax, (%esi,%ecx,4)
+ incl %ecx
+ cmpl $149, %ecx
+ jle .L5
+
+llvm-gcc compiles it to the much uglier:
+
+LBB1_1: ## bb1
+ movl 24(%esp), %eax
+ movl (%eax,%edi,4), %ebx
+ movl %ebx, %ebp
+ imull %esi, %ebp
+ movl %ebx, %eax
+ mull %ecx
+ addl %ebp, %edx
+ sarl $31, %ebx
+ imull %ecx, %ebx
+ addl %edx, %ebx
+ shldl $1, %eax, %ebx
+ movl 20(%esp), %eax
+ addl %ebx, (%eax,%edi,4)
+ incl %edi
+ cmpl $150, %edi
+ jne LBB1_1 ## bb1
+
+The issue is that we hoist the cast of "scaler" to long long outside of the
+loop, the value comes into the loop as two values, and
+RegsForValue::getCopyFromRegs doesn't know how to put an AssertSext on the
+constructed BUILD_PAIR which represents the cast value.
+
+//===---------------------------------------------------------------------===//
+
+Test instructions can be eliminated by using EFLAGS values from arithmetic
+instructions. This is currently not done for mul, and, or, xor, neg, shl,
+sra, srl, shld, shrd, atomic ops, and others. It is also currently not done
+for read-modify-write instructions. It is also currently not done if the
+OF or CF flags are needed.
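+
+A hypothetical example of the kind of test that could be eliminated:
+
+int f(int a, int b) {
+  int c = a & b;            /* andl already sets ZF (and clears OF/CF) */
+  return c == 0 ? 7 : 42;   /* the separate test of c against zero could be
+                               elided by reusing EFLAGS from the and */
+}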
+
+The shift operators have the complication that when the shift count is
+zero, EFLAGS is not set, so they can only subsume a test instruction if
+the shift count is known to be non-zero. Also, using the EFLAGS value
+from a shift is apparently very slow on some x86 implementations.
+
+In read-modify-write instructions, the root node in the isel match is
+the store, and isel has no way for the use of the EFLAGS result of the
+arithmetic to be remapped to the new node.
+
+Add and subtract instructions set OF on signed overflow and CF on unsigned
+overflow, while test instructions always clear OF and CF. In order to
+replace a test with an add or subtract in a situation where OF or CF is
+needed, codegen must be able to prove that the operation cannot see
+signed or unsigned overflow, respectively.
+
+//===---------------------------------------------------------------------===//
+
+memcpy/memmove do not lower to SSE copies when possible. A silly example is:
+define <16 x float> @foo(<16 x float> %A) nounwind {
+ %tmp = alloca <16 x float>, align 16
+ %tmp2 = alloca <16 x float>, align 16
+ store <16 x float> %A, <16 x float>* %tmp
+ %s = bitcast <16 x float>* %tmp to i8*
+ %s2 = bitcast <16 x float>* %tmp2 to i8*
+ call void @llvm.memcpy.i64(i8* %s, i8* %s2, i64 64, i32 16)
+ %R = load <16 x float>* %tmp2
+ ret <16 x float> %R
+}
+
+declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind
+
+which compiles to:
+
+_foo:
+ subl $140, %esp
+ movaps %xmm3, 112(%esp)
+ movaps %xmm2, 96(%esp)
+ movaps %xmm1, 80(%esp)
+ movaps %xmm0, 64(%esp)
+ movl 60(%esp), %eax
+ movl %eax, 124(%esp)
+ movl 56(%esp), %eax
+ movl %eax, 120(%esp)
+ movl 52(%esp), %eax
+ <many many more 32-bit copies>
+ movaps (%esp), %xmm0
+ movaps 16(%esp), %xmm1
+ movaps 32(%esp), %xmm2
+ movaps 48(%esp), %xmm3
+ addl $140, %esp
+ ret
+
+On Nehalem, it may even be cheaper to just use movups when unaligned than to
+fall back to lower-granularity chunks.
+
+//===---------------------------------------------------------------------===//
+
+Implement processor-specific optimizations for parity with GCC on these
+processors. GCC does two optimizations:
+
+1. ix86_pad_returns inserts a noop before ret instructions that are immediately
+   preceded by a conditional branch or are the target of a jump.
+2. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of
+ code contains more than 3 branches.
+
+The first one is done for all AMDs, Core2, and "Generic"
+The second one is done for: Atom, Pentium Pro, all AMDs, Pentium 4, Nocona,
+ Core 2, and "Generic"
+
+//===---------------------------------------------------------------------===//
+
+Testcase:
+int a(int x) { return (x & 127) > 31; }
+
+Current output:
+ movl 4(%esp), %eax
+ andl $127, %eax
+ cmpl $31, %eax
+ seta %al
+ movzbl %al, %eax
+ ret
+
+Ideal output:
+ xorl %eax, %eax
+ testl $96, 4(%esp)
+ setne %al
+ ret
+
+This should definitely be done in instcombine, canonicalizing the range
+condition into a != condition. We get this IR:
+
+define i32 @a(i32 %x) nounwind readnone {
+entry:
+ %0 = and i32 %x, 127 ; <i32> [#uses=1]
+ %1 = icmp ugt i32 %0, 31 ; <i1> [#uses=1]
+ %2 = zext i1 %1 to i32 ; <i32> [#uses=1]
+ ret i32 %2
+}
+
+Instcombine prefers to strength-reduce relational comparisons to equality
+comparisons when possible; this should be another case of that. This could
+be handled pretty easily in InstCombiner::visitICmpInstWithInstAndIntCst, but it
+looks like that function really should be redesigned to use ComputeMaskedBits
+and friends.
+
+//===---------------------------------------------------------------------===//
+
+Testcase:
+int x(int a) { return (a&0xf0)>>4; }
+
+Current output:
+ movl 4(%esp), %eax
+ shrl $4, %eax
+ andl $15, %eax
+ ret
+
+Ideal output:
+ movzbl 4(%esp), %eax
+ shrl $4, %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+Testcase:
+int x(int a) { return (a & 0x80) ? 0x100 : 0; }
+int y(int a) { return (a & 0x80) *2; }
+
+Current:
+ testl $128, 4(%esp)
+ setne %al
+ movzbl %al, %eax
+ shll $8, %eax
+ ret
+
+Better:
+ movl 4(%esp), %eax
+ addl %eax, %eax
+ andl $256, %eax
+ ret
+
+This is another general instcombine transformation that is profitable on all
+targets. In LLVM IR, these functions look like this:
+
+define i32 @x(i32 %a) nounwind readnone {
+entry:
+ %0 = and i32 %a, 128
+ %1 = icmp eq i32 %0, 0
+ %iftmp.0.0 = select i1 %1, i32 0, i32 256
+ ret i32 %iftmp.0.0
+}
+
+define i32 @y(i32 %a) nounwind readnone {
+entry:
+ %0 = shl i32 %a, 1
+ %1 = and i32 %0, 256
+ ret i32 %1
+}
+
+Replacing an icmp+select with a shift should always be considered profitable in
+instcombine.
+
+//===---------------------------------------------------------------------===//
+
+Re-implement atomic builtins __sync_add_and_fetch() and __sync_sub_and_fetch
+properly.
+
+When the return value is not used (i.e. we only care about the value in
+memory), x86 does not have to use add to implement these. Instead, it can use
+add, sub, inc, or dec instructions with the "lock" prefix.
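+
+For example (sketch):
+
+void bump(int *counter) {
+  __sync_add_and_fetch(counter, 1);  /* result unused: a single
+                                        "lock addl $1, (%rdi)" or
+                                        "lock incl (%rdi)" would do */
+}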
+
+This is currently implemented using a bit of an instruction selection trick.
+The issue is that the target-independent pattern produces one output and a
+chain, and we want to map it into one that just outputs a chain. The current
+trick is to select it into a MERGE_VALUES with the first definition being an
+implicit_def. The proper solution is to add new ISD opcodes for the no-output
+variant. The DAG combiner can then transform the node before it gets to target
+node selection.
+
+Problem #2 is that we are adding a whole bunch of x86 atomic instructions when
+in fact these instructions are identical to the non-lock versions. We need a
+way to add target-specific information to target nodes and have this
+information carried over to machine instructions. The asm printer (or JIT) can
+then use this information to add the "lock" prefix.
+
+//===---------------------------------------------------------------------===//
+
+_Bool bar(int *x) { return *x & 1; }
+
+define zeroext i1 @bar(i32* nocapture %x) nounwind readonly {
+entry:
+ %tmp1 = load i32* %x ; <i32> [#uses=1]
+ %and = and i32 %tmp1, 1 ; <i32> [#uses=1]
+ %tobool = icmp ne i32 %and, 0 ; <i1> [#uses=1]
+ ret i1 %tobool
+}
+
+bar: # @bar
+# BB#0: # %entry
+ movl 4(%esp), %eax
+ movb (%eax), %al
+ andb $1, %al
+ movzbl %al, %eax
+ ret
+
+Missed optimization: should be movl+andl.
+
+//===---------------------------------------------------------------------===//
+
+Consider the following two functions compiled with clang:
+_Bool foo(int *x) { return !(*x & 4); }
+unsigned bar(int *x) { return !(*x & 4); }
+
+foo:
+ movl 4(%esp), %eax
+ testb $4, (%eax)
+ sete %al
+ movzbl %al, %eax
+ ret
+
+bar:
+ movl 4(%esp), %eax
+ movl (%eax), %eax
+ shrl $2, %eax
+ andl $1, %eax
+ xorl $1, %eax
+ ret
+
+The second function generates more code even though the two functions are
+functionally identical.
+
+//===---------------------------------------------------------------------===//
+
+Take the following C code:
+int x(int y) { return (y & 63) << 14; }
+
+Code produced by gcc:
+ andl $63, %edi
+ sall $14, %edi
+ movl %edi, %eax
+ ret
+
+Code produced by clang:
+ shll $14, %edi
+ movl %edi, %eax
+ andl $1032192, %eax
+ ret
+
+The code produced by gcc is 3 bytes shorter. This sort of construct often
+shows up with bitfields.
+
+//===---------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/X86/SSEDomainFix.cpp b/contrib/llvm/lib/Target/X86/SSEDomainFix.cpp
new file mode 100644
index 0000000..dab070e
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/SSEDomainFix.cpp
@@ -0,0 +1,506 @@
+//===- SSEDomainFix.cpp - Use proper int/float domain for SSE ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the SSEDomainFix pass.
+//
+// Some SSE instructions like mov, and, or, xor are available in different
+// variants for different operand types. These variant instructions are
+// equivalent, but on Nehalem and newer cpus there is extra latency
+// transferring data between integer and floating point domains.
+//
+// This pass changes the variant instructions to minimize domain crossings.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "sse-domain-fix"
+#include "X86InstrInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+/// A DomainValue is a bit like LiveIntervals' ValNo, but it also keeps track
+/// of execution domains.
+///
+/// An open DomainValue represents a set of instructions that can still switch
+/// execution domain. Multiple registers may refer to the same open
+/// DomainValue - they will eventually be collapsed to the same execution
+/// domain.
+///
+/// A collapsed DomainValue represents a single register that has been forced
+/// into one or more execution domains. There is a separate collapsed
+/// DomainValue for each register, but it may contain multiple execution
+/// domains. A register value is initially created in a single execution
+/// domain, but if we were forced to pay the penalty of a domain crossing, we
+/// keep track of the fact that the register is now available in multiple
+/// domains.
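+///
+/// For example, a chain of instructions that all exist in both the integer and
+/// floating point domains keeps a single open DomainValue alive. The first
+/// time one of those registers reaches an instruction that only exists in one
+/// domain, the DomainValue is collapsed to that domain and all of the queued
+/// instructions are rewritten to that domain's variant.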
+namespace {
+struct DomainValue {
+ // Basic reference counting.
+ unsigned Refs;
+
+ // Bitmask of available domains. For an open DomainValue, it is the still
+ // possible domains for collapsing. For a collapsed DomainValue it is the
+ // domains where the register is available for free.
+ unsigned AvailableDomains;
+
+ // Position of the last defining instruction.
+ unsigned Dist;
+
+ // Twiddleable instructions using or defining these registers.
+ SmallVector<MachineInstr*, 8> Instrs;
+
+ // A collapsed DomainValue has no instructions to twiddle - it simply keeps
+ // track of the domains where the registers are already available.
+ bool isCollapsed() const { return Instrs.empty(); }
+
+ // Is domain available?
+ bool hasDomain(unsigned domain) const {
+ return AvailableDomains & (1u << domain);
+ }
+
+ // Mark domain as available.
+ void addDomain(unsigned domain) {
+ AvailableDomains |= 1u << domain;
+ }
+
+ // Restrict to a single domain available.
+ void setSingleDomain(unsigned domain) {
+ AvailableDomains = 1u << domain;
+ }
+
+ // Return bitmask of domains that are available and in mask.
+ unsigned getCommonDomains(unsigned mask) const {
+ return AvailableDomains & mask;
+ }
+
+ // First domain available.
+ unsigned getFirstDomain() const {
+ return CountTrailingZeros_32(AvailableDomains);
+ }
+
+ DomainValue() { clear(); }
+
+ void clear() {
+ Refs = AvailableDomains = Dist = 0;
+ Instrs.clear();
+ }
+};
+}
+
+static const unsigned NumRegs = 16;
+
+namespace {
+class SSEDomainFixPass : public MachineFunctionPass {
+ static char ID;
+ SpecificBumpPtrAllocator<DomainValue> Allocator;
+ SmallVector<DomainValue*,16> Avail;
+
+ MachineFunction *MF;
+ const X86InstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ MachineBasicBlock *MBB;
+ DomainValue **LiveRegs;
+ typedef DenseMap<MachineBasicBlock*,DomainValue**> LiveOutMap;
+ LiveOutMap LiveOuts;
+ unsigned Distance;
+
+public:
+ SSEDomainFixPass() : MachineFunctionPass(&ID) {}
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ virtual bool runOnMachineFunction(MachineFunction &MF);
+
+ virtual const char *getPassName() const {
+ return "SSE execution domain fixup";
+ }
+
+private:
+ // Register mapping.
+ int RegIndex(unsigned Reg);
+
+ // DomainValue allocation.
+ DomainValue *Alloc(int domain = -1);
+ void Recycle(DomainValue*);
+
+ // LiveRegs manipulations.
+ void SetLiveReg(int rx, DomainValue *DV);
+ void Kill(int rx);
+ void Force(int rx, unsigned domain);
+ void Collapse(DomainValue *dv, unsigned domain);
+ bool Merge(DomainValue *A, DomainValue *B);
+
+ void enterBasicBlock();
+ void visitGenericInstr(MachineInstr*);
+ void visitSoftInstr(MachineInstr*, unsigned mask);
+ void visitHardInstr(MachineInstr*, unsigned domain);
+};
+}
+
+char SSEDomainFixPass::ID = 0;
+
+/// Translate TRI register number to an index into our smaller tables of
+/// interesting registers. Return -1 for boring registers.
+int SSEDomainFixPass::RegIndex(unsigned reg) {
+ assert(X86::XMM15 == X86::XMM0+NumRegs-1 && "Unexpected sort");
+ reg -= X86::XMM0;
+ return reg < NumRegs ? (int) reg : -1;
+}
+
+DomainValue *SSEDomainFixPass::Alloc(int domain) {
+ DomainValue *dv = Avail.empty() ?
+ new(Allocator.Allocate()) DomainValue :
+ Avail.pop_back_val();
+ dv->Dist = Distance;
+ if (domain >= 0)
+ dv->addDomain(domain);
+ return dv;
+}
+
+void SSEDomainFixPass::Recycle(DomainValue *dv) {
+ assert(dv && "Cannot recycle NULL");
+ dv->clear();
+ Avail.push_back(dv);
+}
+
+/// Set LiveRegs[rx] = dv, updating reference counts.
+void SSEDomainFixPass::SetLiveReg(int rx, DomainValue *dv) {
+ assert(unsigned(rx) < NumRegs && "Invalid index");
+ if (!LiveRegs) {
+ LiveRegs = new DomainValue*[NumRegs];
+ std::fill(LiveRegs, LiveRegs+NumRegs, (DomainValue*)0);
+ }
+
+ if (LiveRegs[rx] == dv)
+ return;
+ if (LiveRegs[rx]) {
+ assert(LiveRegs[rx]->Refs && "Bad refcount");
+ if (--LiveRegs[rx]->Refs == 0) Recycle(LiveRegs[rx]);
+ }
+ LiveRegs[rx] = dv;
+ if (dv) ++dv->Refs;
+}
+
+// Kill register rx, recycle or collapse any DomainValue.
+void SSEDomainFixPass::Kill(int rx) {
+ assert(unsigned(rx) < NumRegs && "Invalid index");
+ if (!LiveRegs || !LiveRegs[rx]) return;
+
+ // Before killing the last reference to an open DomainValue, collapse it to
+ // the first available domain.
+ if (LiveRegs[rx]->Refs == 1 && !LiveRegs[rx]->isCollapsed())
+ Collapse(LiveRegs[rx], LiveRegs[rx]->getFirstDomain());
+ else
+ SetLiveReg(rx, 0);
+}
+
+/// Force register rx into domain.
+void SSEDomainFixPass::Force(int rx, unsigned domain) {
+ assert(unsigned(rx) < NumRegs && "Invalid index");
+ DomainValue *dv;
+ if (LiveRegs && (dv = LiveRegs[rx])) {
+ if (dv->isCollapsed())
+ dv->addDomain(domain);
+ else if (dv->hasDomain(domain))
+ Collapse(dv, domain);
+ else {
+ // This is an incompatible open DomainValue. Collapse it to any available
+ // domain and then force the new value into the requested domain. This
+ // costs a domain crossing.
+ Collapse(dv, dv->getFirstDomain());
+ assert(LiveRegs[rx] && "Not live after collapse?");
+ LiveRegs[rx]->addDomain(domain);
+ }
+ } else {
+ // Set up basic collapsed DomainValue.
+ SetLiveReg(rx, Alloc(domain));
+ }
+}
+
+/// Collapse open DomainValue into given domain. If there are multiple
+/// registers using dv, they each get a unique collapsed DomainValue.
+void SSEDomainFixPass::Collapse(DomainValue *dv, unsigned domain) {
+ assert(dv->hasDomain(domain) && "Cannot collapse");
+
+ // Collapse all the instructions.
+ while (!dv->Instrs.empty())
+ TII->SetSSEDomain(dv->Instrs.pop_back_val(), domain);
+ dv->setSingleDomain(domain);
+
+ // If there are multiple users, give them new, unique DomainValues.
+ if (LiveRegs && dv->Refs > 1)
+ for (unsigned rx = 0; rx != NumRegs; ++rx)
+ if (LiveRegs[rx] == dv)
+ SetLiveReg(rx, Alloc(domain));
+}
+
+/// Merge - All instructions and registers in B are moved to A, and B is
+/// released.
+bool SSEDomainFixPass::Merge(DomainValue *A, DomainValue *B) {
+ assert(!A->isCollapsed() && "Cannot merge into collapsed");
+ assert(!B->isCollapsed() && "Cannot merge from collapsed");
+ if (A == B)
+ return true;
+ // Restrict to the domains that A and B have in common.
+ unsigned common = A->getCommonDomains(B->AvailableDomains);
+ if (!common)
+ return false;
+ A->AvailableDomains = common;
+ A->Dist = std::max(A->Dist, B->Dist);
+ A->Instrs.append(B->Instrs.begin(), B->Instrs.end());
+ for (unsigned rx = 0; rx != NumRegs; ++rx)
+ if (LiveRegs[rx] == B)
+ SetLiveReg(rx, A);
+ return true;
+}
+
+void SSEDomainFixPass::enterBasicBlock() {
+ // Try to coalesce live-out registers from predecessors.
+ for (MachineBasicBlock::livein_iterator i = MBB->livein_begin(),
+ e = MBB->livein_end(); i != e; ++i) {
+ int rx = RegIndex(*i);
+ if (rx < 0) continue;
+ for (MachineBasicBlock::const_pred_iterator pi = MBB->pred_begin(),
+ pe = MBB->pred_end(); pi != pe; ++pi) {
+ LiveOutMap::const_iterator fi = LiveOuts.find(*pi);
+ if (fi == LiveOuts.end()) continue;
+ DomainValue *pdv = fi->second[rx];
+ if (!pdv) continue;
+ if (!LiveRegs || !LiveRegs[rx]) {
+ SetLiveReg(rx, pdv);
+ continue;
+ }
+
+ // We have a live DomainValue from more than one predecessor.
+ if (LiveRegs[rx]->isCollapsed()) {
+ // We are already collapsed, but the predecessor is not. Force it.
+ unsigned domain = LiveRegs[rx]->getFirstDomain();
+ if (!pdv->isCollapsed() && pdv->hasDomain(domain))
+ Collapse(pdv, domain);
+ continue;
+ }
+
+ // Currently open, merge in predecessor.
+ if (!pdv->isCollapsed())
+ Merge(LiveRegs[rx], pdv);
+ else
+ Force(rx, pdv->getFirstDomain());
+ }
+ }
+}
+
+// A hard instruction only works in one domain. All input registers will be
+// forced into that domain.
+void SSEDomainFixPass::visitHardInstr(MachineInstr *mi, unsigned domain) {
+ // Collapse all uses.
+ for (unsigned i = mi->getDesc().getNumDefs(),
+ e = mi->getDesc().getNumOperands(); i != e; ++i) {
+ MachineOperand &mo = mi->getOperand(i);
+ if (!mo.isReg()) continue;
+ int rx = RegIndex(mo.getReg());
+ if (rx < 0) continue;
+ Force(rx, domain);
+ }
+
+ // Kill all defs and force them.
+ for (unsigned i = 0, e = mi->getDesc().getNumDefs(); i != e; ++i) {
+ MachineOperand &mo = mi->getOperand(i);
+ if (!mo.isReg()) continue;
+ int rx = RegIndex(mo.getReg());
+ if (rx < 0) continue;
+ Kill(rx);
+ Force(rx, domain);
+ }
+}
+
+// A soft instruction can be changed to work in other domains given by mask.
+void SSEDomainFixPass::visitSoftInstr(MachineInstr *mi, unsigned mask) {
+ // Bitmask of available domains for this instruction after taking collapsed
+ // operands into account.
+ unsigned available = mask;
+
+ // Scan the explicit use operands for incoming domains.
+ SmallVector<int, 4> used;
+ if (LiveRegs)
+ for (unsigned i = mi->getDesc().getNumDefs(),
+ e = mi->getDesc().getNumOperands(); i != e; ++i) {
+ MachineOperand &mo = mi->getOperand(i);
+ if (!mo.isReg()) continue;
+ int rx = RegIndex(mo.getReg());
+ if (rx < 0) continue;
+ if (DomainValue *dv = LiveRegs[rx]) {
+ // Bitmask of domains that dv and available have in common.
+ unsigned common = dv->getCommonDomains(available);
+ // Is it possible to use this collapsed register for free?
+ if (dv->isCollapsed()) {
+ // Restrict available domains to the ones in common with the operand.
+ // If there are no common domains, we must pay the cross-domain
+ // penalty for this operand.
+ if (common) available = common;
+ } else if (common)
+ // Open DomainValue is compatible, save it for merging.
+ used.push_back(rx);
+ else
+ // Open DomainValue is not compatible with instruction. It is useless
+ // now.
+ Kill(rx);
+ }
+ }
+
+ // If the collapsed operands force a single domain, propagate the collapse.
+ if (isPowerOf2_32(available)) {
+ unsigned domain = CountTrailingZeros_32(available);
+ TII->SetSSEDomain(mi, domain);
+ visitHardInstr(mi, domain);
+ return;
+ }
+
+ // Kill off any remaining uses that don't match available, and build a list of
+ // incoming DomainValues that we want to merge.
+ SmallVector<DomainValue*,4> doms;
+ for (SmallVector<int, 4>::iterator i=used.begin(), e=used.end(); i!=e; ++i) {
+ int rx = *i;
+ DomainValue *dv = LiveRegs[rx];
+ // This useless DomainValue could have been missed above.
+ if (!dv->getCommonDomains(available)) {
+ Kill(*i);
+ continue;
+ }
+ // sorted, uniqued insert.
+ bool inserted = false;
+ for (SmallVector<DomainValue*,4>::iterator i = doms.begin(), e = doms.end();
+ i != e && !inserted; ++i) {
+ if (dv == *i)
+ inserted = true;
+ else if (dv->Dist < (*i)->Dist) {
+ inserted = true;
+ doms.insert(i, dv);
+ }
+ }
+ if (!inserted)
+ doms.push_back(dv);
+ }
+
+ // doms are now sorted in order of appearance. Try to merge them all, giving
+ // priority to the latest ones.
+ DomainValue *dv = 0;
+ while (!doms.empty()) {
+ if (!dv) {
+ dv = doms.pop_back_val();
+ continue;
+ }
+
+ DomainValue *latest = doms.pop_back_val();
+ if (Merge(dv, latest)) continue;
+
+ // If latest didn't merge, it is useless now. Kill all registers using it.
+ for (SmallVector<int,4>::iterator i=used.begin(), e=used.end(); i != e; ++i)
+ if (LiveRegs[*i] == latest)
+ Kill(*i);
+ }
+
+ // dv is the DomainValue we are going to use for this instruction.
+ if (!dv)
+ dv = Alloc();
+ dv->Dist = Distance;
+ dv->AvailableDomains = available;
+ dv->Instrs.push_back(mi);
+
+ // Finally set all defs and non-collapsed uses to dv.
+ for (unsigned i = 0, e = mi->getDesc().getNumOperands(); i != e; ++i) {
+ MachineOperand &mo = mi->getOperand(i);
+ if (!mo.isReg()) continue;
+ int rx = RegIndex(mo.getReg());
+ if (rx < 0) continue;
+ if (!LiveRegs || !LiveRegs[rx] || (mo.isDef() && LiveRegs[rx]!=dv)) {
+ Kill(rx);
+ SetLiveReg(rx, dv);
+ }
+ }
+}
+
+void SSEDomainFixPass::visitGenericInstr(MachineInstr *mi) {
+ // Process explicit defs, kill any XMM registers redefined.
+ for (unsigned i = 0, e = mi->getDesc().getNumDefs(); i != e; ++i) {
+ MachineOperand &mo = mi->getOperand(i);
+ if (!mo.isReg()) continue;
+ int rx = RegIndex(mo.getReg());
+ if (rx < 0) continue;
+ Kill(rx);
+ }
+}
+
+bool SSEDomainFixPass::runOnMachineFunction(MachineFunction &mf) {
+ MF = &mf;
+ TII = static_cast<const X86InstrInfo*>(MF->getTarget().getInstrInfo());
+ TRI = MF->getTarget().getRegisterInfo();
+ MBB = 0;
+ LiveRegs = 0;
+ Distance = 0;
+ assert(NumRegs == X86::VR128RegClass.getNumRegs() && "Bad regclass");
+
+ // If no XMM registers are used in the function, we can skip it completely.
+ bool anyregs = false;
+ for (TargetRegisterClass::const_iterator I = X86::VR128RegClass.begin(),
+ E = X86::VR128RegClass.end(); I != E; ++I)
+ if (MF->getRegInfo().isPhysRegUsed(*I)) {
+ anyregs = true;
+ break;
+ }
+ if (!anyregs) return false;
+
+ MachineBasicBlock *Entry = MF->begin();
+ SmallPtrSet<MachineBasicBlock*, 16> Visited;
+ for (df_ext_iterator<MachineBasicBlock*, SmallPtrSet<MachineBasicBlock*, 16> >
+ DFI = df_ext_begin(Entry, Visited), DFE = df_ext_end(Entry, Visited);
+ DFI != DFE; ++DFI) {
+ MBB = *DFI;
+ enterBasicBlock();
+ for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
+ ++I) {
+ MachineInstr *mi = I;
+ if (mi->isDebugValue()) continue;
+ ++Distance;
+ std::pair<uint16_t, uint16_t> domp = TII->GetSSEDomain(mi);
+ if (domp.first)
+ if (domp.second)
+ visitSoftInstr(mi, domp.second);
+ else
+ visitHardInstr(mi, domp.first);
+ else if (LiveRegs)
+ visitGenericInstr(mi);
+ }
+
+ // Save live registers at end of MBB - used by enterBasicBlock().
+ if (LiveRegs)
+ LiveOuts.insert(std::make_pair(MBB, LiveRegs));
+ LiveRegs = 0;
+ }
+
+ // Clear the LiveOuts vectors. Should we also collapse any remaining
+ // DomainValues?
+ for (LiveOutMap::const_iterator i = LiveOuts.begin(), e = LiveOuts.end();
+ i != e; ++i)
+ delete[] i->second;
+ LiveOuts.clear();
+ Avail.clear();
+ Allocator.DestroyAll();
+
+ return false;
+}
+
+FunctionPass *llvm::createSSEDomainFixPass() {
+ return new SSEDomainFixPass();
+}
diff --git a/contrib/llvm/lib/Target/X86/TargetInfo/CMakeLists.txt b/contrib/llvm/lib/Target/X86/TargetInfo/CMakeLists.txt
new file mode 100644
index 0000000..90be9f5
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/TargetInfo/CMakeLists.txt
@@ -0,0 +1,7 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMX86Info
+ X86TargetInfo.cpp
+ )
+
+add_dependencies(LLVMX86Info X86CodeGenTable_gen)
diff --git a/contrib/llvm/lib/Target/X86/TargetInfo/Makefile b/contrib/llvm/lib/Target/X86/TargetInfo/Makefile
new file mode 100644
index 0000000..ee91982
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/TargetInfo/Makefile
@@ -0,0 +1,16 @@
+##===- lib/Target/X86/TargetInfo/Makefile ------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../../..
+LIBRARYNAME = LLVMX86Info
+
+# Hack: we need to include 'main' target directory to grab private headers
+CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp b/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
new file mode 100644
index 0000000..08d4d84
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
@@ -0,0 +1,23 @@
+//===-- X86TargetInfo.cpp - X86 Target Implementation ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "llvm/Module.h"
+#include "llvm/Target/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheX86_32Target, llvm::TheX86_64Target;
+
+extern "C" void LLVMInitializeX86TargetInfo() {
+ RegisterTarget<Triple::x86, /*HasJIT=*/true>
+ X(TheX86_32Target, "x86", "32-bit X86: Pentium-Pro and above");
+
+ RegisterTarget<Triple::x86_64, /*HasJIT=*/true>
+ Y(TheX86_64Target, "x86-64", "64-bit X86: EM64T and AMD64");
+}
diff --git a/contrib/llvm/lib/Target/X86/X86.h b/contrib/llvm/lib/Target/X86/X86.h
new file mode 100644
index 0000000..22e89a5
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86.h
@@ -0,0 +1,91 @@
+//===-- X86.h - Top-level interface for X86 representation ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the x86
+// target library, as used by the LLVM JIT.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TARGET_X86_H
+#define TARGET_X86_H
+
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class FunctionPass;
+class JITCodeEmitter;
+class MCCodeEmitter;
+class MCContext;
+class MachineCodeEmitter;
+class Target;
+class TargetAsmBackend;
+class X86TargetMachine;
+class formatted_raw_ostream;
+
+/// createX86ISelDag - This pass converts a legalized DAG into a
+/// X86-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *createX86ISelDag(X86TargetMachine &TM,
+ CodeGenOpt::Level OptLevel);
+
+/// createX86FloatingPointStackifierPass - This function returns a pass which
+/// converts floating point register references and pseudo instructions into
+/// floating point stack references and physical instructions.
+///
+FunctionPass *createX86FloatingPointStackifierPass();
+
+/// createSSEDomainFixPass - This pass twiddles SSE opcodes to prevent domain
+/// crossings.
+FunctionPass *createSSEDomainFixPass();
+
+/// createX87FPRegKillInserterPass - This function returns a pass which
+/// inserts FP_REG_KILL instructions where needed.
+///
+FunctionPass *createX87FPRegKillInserterPass();
+
+/// createX86JITCodeEmitterPass - Return a pass that emits the collected X86
+/// code to the specified JITCodeEmitter object.
+FunctionPass *createX86JITCodeEmitterPass(X86TargetMachine &TM,
+ JITCodeEmitter &JCE);
+
+MCCodeEmitter *createX86_32MCCodeEmitter(const Target &, TargetMachine &TM,
+ MCContext &Ctx);
+MCCodeEmitter *createX86_64MCCodeEmitter(const Target &, TargetMachine &TM,
+ MCContext &Ctx);
+
+TargetAsmBackend *createX86_32AsmBackend(const Target &, const std::string &);
+TargetAsmBackend *createX86_64AsmBackend(const Target &, const std::string &);
+
+/// createEmitX86CodeToMemory - Returns a pass that converts a register
+/// allocated function into raw machine code in a dynamically
+/// allocated chunk of memory.
+///
+FunctionPass *createEmitX86CodeToMemory();
+
+/// createX86MaxStackAlignmentHeuristicPass - This function returns a pass
+/// which determines whether the frame pointer register should be
+/// reserved in case dynamic stack alignment is later required.
+///
+FunctionPass *createX86MaxStackAlignmentHeuristicPass();
+
+extern Target TheX86_32Target, TheX86_64Target;
+
+} // End llvm namespace
+
+// Defines symbolic names for X86 registers. This defines a mapping from
+// register name to register number.
+//
+#include "X86GenRegisterNames.inc"
+
+// Defines symbolic names for the X86 instructions.
+//
+#include "X86GenInstrNames.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86.td b/contrib/llvm/lib/Target/X86/X86.td
new file mode 100644
index 0000000..a53f973
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86.td
@@ -0,0 +1,212 @@
+//===- X86.td - Target definition file for the Intel X86 ---*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a target description file for the Intel i386 architecture, referred to
+// here as the "X86" architecture.
+//
+//===----------------------------------------------------------------------===//
+
+// Get the target-independent interfaces which we are implementing...
+//
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// X86 Subtarget features.
+//===----------------------------------------------------------------------===//
+
+def FeatureCMOV : SubtargetFeature<"cmov","HasCMov", "true",
+ "Enable conditional move instructions">;
+
+
+def FeatureMMX : SubtargetFeature<"mmx","X86SSELevel", "MMX",
+ "Enable MMX instructions">;
+def FeatureSSE1 : SubtargetFeature<"sse", "X86SSELevel", "SSE1",
+ "Enable SSE instructions",
+ // SSE codegen depends on cmovs, and all
+ // SSE1+ processors support them.
+ [FeatureMMX, FeatureCMOV]>;
+def FeatureSSE2 : SubtargetFeature<"sse2", "X86SSELevel", "SSE2",
+ "Enable SSE2 instructions",
+ [FeatureSSE1]>;
+def FeatureSSE3 : SubtargetFeature<"sse3", "X86SSELevel", "SSE3",
+ "Enable SSE3 instructions",
+ [FeatureSSE2]>;
+def FeatureSSSE3 : SubtargetFeature<"ssse3", "X86SSELevel", "SSSE3",
+ "Enable SSSE3 instructions",
+ [FeatureSSE3]>;
+def FeatureSSE41 : SubtargetFeature<"sse41", "X86SSELevel", "SSE41",
+ "Enable SSE 4.1 instructions",
+ [FeatureSSSE3]>;
+def FeatureSSE42 : SubtargetFeature<"sse42", "X86SSELevel", "SSE42",
+ "Enable SSE 4.2 instructions",
+ [FeatureSSE41]>;
+def Feature3DNow : SubtargetFeature<"3dnow", "X863DNowLevel", "ThreeDNow",
+ "Enable 3DNow! instructions">;
+def Feature3DNowA : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA",
+ "Enable 3DNow! Athlon instructions",
+ [Feature3DNow]>;
+// All x86-64 hardware has SSE2, but we don't mark SSE2 as an implied
+// feature, because SSE2 can be disabled (e.g. for compiling OS kernels)
+// without disabling 64-bit mode.
+def Feature64Bit : SubtargetFeature<"64bit", "HasX86_64", "true",
+ "Support 64-bit instructions",
+ [FeatureCMOV]>;
+def FeatureSlowBTMem : SubtargetFeature<"slow-bt-mem", "IsBTMemSlow", "true",
+ "Bit testing of memory is slow">;
+def FeatureFastUAMem : SubtargetFeature<"fast-unaligned-mem",
+ "IsUAMemFast", "true",
+ "Fast unaligned memory access">;
+def FeatureSSE4A : SubtargetFeature<"sse4a", "HasSSE4A", "true",
+ "Support SSE 4a instructions">;
+
+def FeatureAVX : SubtargetFeature<"avx", "HasAVX", "true",
+ "Enable AVX instructions">;
+def FeatureFMA3 : SubtargetFeature<"fma3", "HasFMA3", "true",
+ "Enable three-operand fused multiple-add">;
+def FeatureFMA4 : SubtargetFeature<"fma4", "HasFMA4", "true",
+ "Enable four-operand fused multiple-add">;
+def FeatureVectorUAMem : SubtargetFeature<"vector-unaligned-mem",
+ "HasVectorUAMem", "true",
+ "Allow unaligned memory operands on vector/SIMD instructions">;
+def FeatureAES : SubtargetFeature<"aes", "HasAES", "true",
+ "Enable AES instructions">;
+
+//===----------------------------------------------------------------------===//
+// X86 processors supported.
+//===----------------------------------------------------------------------===//
+
+class Proc<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, NoItineraries, Features>;
+
+def : Proc<"generic", []>;
+def : Proc<"i386", []>;
+def : Proc<"i486", []>;
+def : Proc<"i586", []>;
+def : Proc<"pentium", []>;
+def : Proc<"pentium-mmx", [FeatureMMX]>;
+def : Proc<"i686", []>;
+def : Proc<"pentiumpro", [FeatureCMOV]>;
+def : Proc<"pentium2", [FeatureMMX, FeatureCMOV]>;
+def : Proc<"pentium3", [FeatureSSE1]>;
+def : Proc<"pentium-m", [FeatureSSE2, FeatureSlowBTMem]>;
+def : Proc<"pentium4", [FeatureSSE2]>;
+def : Proc<"x86-64", [FeatureSSE2, Feature64Bit, FeatureSlowBTMem]>;
+def : Proc<"yonah", [FeatureSSE3, FeatureSlowBTMem]>;
+def : Proc<"prescott", [FeatureSSE3, FeatureSlowBTMem]>;
+def : Proc<"nocona", [FeatureSSE3, Feature64Bit, FeatureSlowBTMem]>;
+def : Proc<"core2", [FeatureSSSE3, Feature64Bit, FeatureSlowBTMem]>;
+def : Proc<"penryn", [FeatureSSE41, Feature64Bit, FeatureSlowBTMem]>;
+def : Proc<"atom", [FeatureSSE3, Feature64Bit, FeatureSlowBTMem]>;
+// "Arrandale" along with corei3 and corei5
+def : Proc<"corei7", [FeatureSSE42, Feature64Bit, FeatureSlowBTMem,
+ FeatureFastUAMem, FeatureAES]>;
+def : Proc<"nehalem", [FeatureSSE42, Feature64Bit, FeatureSlowBTMem,
+ FeatureFastUAMem]>;
+// Westmere is a similar machine to Nehalem with some additional features;
+// it is the corei3/i5/i7 step from Nehalem to Sandy Bridge.
+def : Proc<"westmere", [FeatureSSE42, Feature64Bit, FeatureSlowBTMem,
+ FeatureFastUAMem, FeatureAES]>;
+// Sandy Bridge does not have FMA
+// FIXME: Wikipedia says it does... it should have AES as well.
+def : Proc<"sandybridge", [FeatureSSE42, FeatureAVX, Feature64Bit]>;
+
+def : Proc<"k6", [FeatureMMX]>;
+def : Proc<"k6-2", [FeatureMMX, Feature3DNow]>;
+def : Proc<"k6-3", [FeatureMMX, Feature3DNow]>;
+def : Proc<"athlon", [FeatureMMX, Feature3DNowA, FeatureSlowBTMem]>;
+def : Proc<"athlon-tbird", [FeatureMMX, Feature3DNowA, FeatureSlowBTMem]>;
+def : Proc<"athlon-4", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem]>;
+def : Proc<"athlon-xp", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem]>;
+def : Proc<"athlon-mp", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem]>;
+def : Proc<"k8", [FeatureSSE2, Feature3DNowA, Feature64Bit,
+ FeatureSlowBTMem]>;
+def : Proc<"opteron", [FeatureSSE2, Feature3DNowA, Feature64Bit,
+ FeatureSlowBTMem]>;
+def : Proc<"athlon64", [FeatureSSE2, Feature3DNowA, Feature64Bit,
+ FeatureSlowBTMem]>;
+def : Proc<"athlon-fx", [FeatureSSE2, Feature3DNowA, Feature64Bit,
+ FeatureSlowBTMem]>;
+def : Proc<"k8-sse3", [FeatureSSE3, Feature3DNowA, Feature64Bit,
+ FeatureSlowBTMem]>;
+def : Proc<"opteron-sse3", [FeatureSSE3, Feature3DNowA, Feature64Bit,
+ FeatureSlowBTMem]>;
+def : Proc<"athlon64-sse3", [FeatureSSE3, Feature3DNowA, Feature64Bit,
+ FeatureSlowBTMem]>;
+def : Proc<"amdfam10", [FeatureSSE3, FeatureSSE4A,
+ Feature3DNowA, Feature64Bit, FeatureSlowBTMem]>;
+def : Proc<"barcelona", [FeatureSSE3, FeatureSSE4A,
+ Feature3DNowA, Feature64Bit, FeatureSlowBTMem]>;
+def : Proc<"istanbul", [Feature3DNowA, Feature64Bit, FeatureSSE4A,
+ Feature3DNowA]>;
+def : Proc<"shanghai", [Feature3DNowA, Feature64Bit, FeatureSSE4A,
+ Feature3DNowA]>;
+
+def : Proc<"winchip-c6", [FeatureMMX]>;
+def : Proc<"winchip2", [FeatureMMX, Feature3DNow]>;
+def : Proc<"c3", [FeatureMMX, Feature3DNow]>;
+def : Proc<"c3-2", [FeatureSSE1]>;
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "X86RegisterInfo.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "X86InstrInfo.td"
+
+def X86InstrInfo : InstrInfo;
+
+//===----------------------------------------------------------------------===//
+// Calling Conventions
+//===----------------------------------------------------------------------===//
+
+include "X86CallingConv.td"
+
+
+//===----------------------------------------------------------------------===//
+// Assembly Parser & Printers
+//===----------------------------------------------------------------------===//
+
+// Currently the X86 assembly parser only supports ATT syntax.
+def ATTAsmParser : AsmParser {
+ string AsmParserClassName = "ATTAsmParser";
+ string AsmParserInstCleanup = "InstructionCleanup";
+ string MatchInstructionName = "MatchInstructionImpl";
+ int Variant = 0;
+
+ // Discard comments in assembly strings.
+ string CommentDelimiter = "#";
+
+ // Recognize hard coded registers.
+ string RegisterPrefix = "%";
+}
+
+// The X86 target supports emitting assembly in two different syntaxes (AT&T
+// and Intel). This is controlled by the -x86-asm-syntax={att|intel} option.
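+// For example, "llc -x86-asm-syntax=intel foo.bc" prints Intel syntax;
+// AT&T syntax (writer variant 0) is the default.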
+def ATTAsmWriter : AsmWriter {
+ string AsmWriterClassName = "ATTInstPrinter";
+ int Variant = 0;
+}
+def IntelAsmWriter : AsmWriter {
+ string AsmWriterClassName = "IntelInstPrinter";
+ int Variant = 1;
+}
+
+def X86 : Target {
+ // Information about the instructions...
+ let InstructionSet = X86InstrInfo;
+
+ let AssemblyParsers = [ATTAsmParser];
+
+ let AssemblyWriters = [ATTAsmWriter, IntelAsmWriter];
+}
diff --git a/contrib/llvm/lib/Target/X86/X86AsmBackend.cpp b/contrib/llvm/lib/Target/X86/X86AsmBackend.cpp
new file mode 100644
index 0000000..151087f
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86AsmBackend.cpp
@@ -0,0 +1,304 @@
+//===-- X86AsmBackend.cpp - X86 Assembler Backend -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetAsmBackend.h"
+#include "X86.h"
+#include "X86FixupKinds.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MachObjectWriter.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Target/TargetAsmBackend.h"
+using namespace llvm;
+
+namespace {
+
+static unsigned getFixupKindLog2Size(unsigned Kind) {
+ switch (Kind) {
+ default: assert(0 && "invalid fixup kind!");
+ case X86::reloc_pcrel_1byte:
+ case FK_Data_1: return 0;
+ case FK_Data_2: return 1;
+ case X86::reloc_pcrel_4byte:
+ case X86::reloc_riprel_4byte:
+ case X86::reloc_riprel_4byte_movq_load:
+ case FK_Data_4: return 2;
+ case FK_Data_8: return 3;
+ }
+}
+
+class X86AsmBackend : public TargetAsmBackend {
+public:
+ X86AsmBackend(const Target &T)
+ : TargetAsmBackend(T) {}
+
+ void ApplyFixup(const MCFixup &Fixup, MCDataFragment &DF,
+ uint64_t Value) const {
+ unsigned Size = 1 << getFixupKindLog2Size(Fixup.getKind());
+
+ assert(Fixup.getOffset() + Size <= DF.getContents().size() &&
+ "Invalid fixup offset!");
+ for (unsigned i = 0; i != Size; ++i)
+ DF.getContents()[Fixup.getOffset() + i] = uint8_t(Value >> (i * 8));
+ }
+
+ bool MayNeedRelaxation(const MCInst &Inst) const;
+
+ void RelaxInstruction(const MCInst &Inst, MCInst &Res) const;
+
+ bool WriteNopData(uint64_t Count, MCObjectWriter *OW) const;
+};
+
+static unsigned getRelaxedOpcode(unsigned Op) {
+ switch (Op) {
+ default:
+ return Op;
+
+ case X86::JAE_1: return X86::JAE_4;
+ case X86::JA_1: return X86::JA_4;
+ case X86::JBE_1: return X86::JBE_4;
+ case X86::JB_1: return X86::JB_4;
+ case X86::JE_1: return X86::JE_4;
+ case X86::JGE_1: return X86::JGE_4;
+ case X86::JG_1: return X86::JG_4;
+ case X86::JLE_1: return X86::JLE_4;
+ case X86::JL_1: return X86::JL_4;
+ case X86::TAILJMP_1:
+ case X86::JMP_1: return X86::JMP_4;
+ case X86::JNE_1: return X86::JNE_4;
+ case X86::JNO_1: return X86::JNO_4;
+ case X86::JNP_1: return X86::JNP_4;
+ case X86::JNS_1: return X86::JNS_4;
+ case X86::JO_1: return X86::JO_4;
+ case X86::JP_1: return X86::JP_4;
+ case X86::JS_1: return X86::JS_4;
+ }
+}
+
+bool X86AsmBackend::MayNeedRelaxation(const MCInst &Inst) const {
+ // Check if this instruction is ever relaxable.
+ if (getRelaxedOpcode(Inst.getOpcode()) == Inst.getOpcode())
+ return false;
+
+ // If so, just assume it can be relaxed. Once we support relaxing more complex
+ // instructions we should check that the instruction actually has symbolic
+ // operands before doing this, but we need to be careful about things like
+ // PCrel.
+ return true;
+}
+
+// FIXME: Can tblgen help at all here to verify there aren't other instructions
+// we can relax?
+void X86AsmBackend::RelaxInstruction(const MCInst &Inst, MCInst &Res) const {
+ // The only relaxation X86 does is from a 1-byte pcrel to a 4-byte pcrel.
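+ // For example, JE_1 (a 2-byte "je rel8") is relaxed to JE_4 (a 6-byte
+ // "je rel32") when the branch target ends up out of rel8 range.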
+ unsigned RelaxedOp = getRelaxedOpcode(Inst.getOpcode());
+
+ if (RelaxedOp == Inst.getOpcode()) {
+ SmallString<256> Tmp;
+ raw_svector_ostream OS(Tmp);
+ Inst.dump_pretty(OS);
+ OS << "\n";
+ report_fatal_error("unexpected instruction to relax: " + OS.str());
+ }
+
+ Res = Inst;
+ Res.setOpcode(RelaxedOp);
+}
+
+/// WriteNopData - Write optimal nops to the output file to cover \arg Count
+/// bytes. An optimal multi-byte nop sequence is used for the first 15 bytes;
+/// any remaining bytes are padded with single-byte nops. Returns true on
+/// success.
+///
+/// FIXME: this is X86 32-bit specific and should move to a better place.
+bool X86AsmBackend::WriteNopData(uint64_t Count, MCObjectWriter *OW) const {
+ static const uint8_t Nops[16][16] = {
+ // nop
+ {0x90},
+ // xchg %ax,%ax
+ {0x66, 0x90},
+ // nopl (%[re]ax)
+ {0x0f, 0x1f, 0x00},
+ // nopl 0(%[re]ax)
+ {0x0f, 0x1f, 0x40, 0x00},
+ // nopl 0(%[re]ax,%[re]ax,1)
+ {0x0f, 0x1f, 0x44, 0x00, 0x00},
+ // nopw 0(%[re]ax,%[re]ax,1)
+ {0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00},
+ // nopl 0L(%[re]ax)
+ {0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00},
+ // nopl 0L(%[re]ax,%[re]ax,1)
+ {0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+ // nopw 0L(%[re]ax,%[re]ax,1)
+ {0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+ // nopw %cs:0L(%[re]ax,%[re]ax,1)
+ {0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+ // nopl 0(%[re]ax,%[re]ax,1)
+ // nopw 0(%[re]ax,%[re]ax,1)
+ {0x0f, 0x1f, 0x44, 0x00, 0x00,
+ 0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00},
+ // nopw 0(%[re]ax,%[re]ax,1)
+ // nopw 0(%[re]ax,%[re]ax,1)
+ {0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00,
+ 0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00},
+ // nopw 0(%[re]ax,%[re]ax,1)
+ // nopl 0L(%[re]ax)
+ {0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00,
+ 0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00},
+ // nopl 0L(%[re]ax)
+ // nopl 0L(%[re]ax)
+ {0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00,
+ 0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00},
+ // nopl 0L(%[re]ax)
+ // nopl 0L(%[re]ax,%[re]ax,1)
+ {0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00,
+ 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}
+ };
+
+ // Write an optimal sequence for the first 15 bytes.
+ uint64_t OptimalCount = (Count < 16) ? Count : 15;
+ for (uint64_t i = 0, e = OptimalCount; i != e; i++)
+ OW->Write8(Nops[OptimalCount - 1][i]);
+
+ // Finish with single byte nops.
+ for (uint64_t i = OptimalCount, e = Count; i != e; ++i)
+ OW->Write8(0x90);
+
+ return true;
+}
+
+/* *** */
+
+class ELFX86AsmBackend : public X86AsmBackend {
+public:
+ ELFX86AsmBackend(const Target &T)
+ : X86AsmBackend(T) {
+ HasAbsolutizedSet = true;
+ HasScatteredSymbols = true;
+ }
+
+ MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+ return 0;
+ }
+
+ bool isVirtualSection(const MCSection &Section) const {
+ const MCSectionELF &SE = static_cast<const MCSectionELF&>(Section);
+ return SE.getType() == MCSectionELF::SHT_NOBITS;
+ }
+};
+
+class ELFX86_32AsmBackend : public ELFX86AsmBackend {
+public:
+ ELFX86_32AsmBackend(const Target &T)
+ : ELFX86AsmBackend(T) {}
+};
+
+class ELFX86_64AsmBackend : public ELFX86AsmBackend {
+public:
+ ELFX86_64AsmBackend(const Target &T)
+ : ELFX86AsmBackend(T) {}
+};
+
+class DarwinX86AsmBackend : public X86AsmBackend {
+public:
+ DarwinX86AsmBackend(const Target &T)
+ : X86AsmBackend(T) {
+ HasAbsolutizedSet = true;
+ HasScatteredSymbols = true;
+ }
+
+ bool isVirtualSection(const MCSection &Section) const {
+ const MCSectionMachO &SMO = static_cast<const MCSectionMachO&>(Section);
+ return (SMO.getType() == MCSectionMachO::S_ZEROFILL ||
+ SMO.getType() == MCSectionMachO::S_GB_ZEROFILL ||
+ SMO.getType() == MCSectionMachO::S_THREAD_LOCAL_ZEROFILL);
+ }
+};
+
+class DarwinX86_32AsmBackend : public DarwinX86AsmBackend {
+public:
+ DarwinX86_32AsmBackend(const Target &T)
+ : DarwinX86AsmBackend(T) {}
+
+ MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+ return new MachObjectWriter(OS, /*Is64Bit=*/false);
+ }
+};
+
+class DarwinX86_64AsmBackend : public DarwinX86AsmBackend {
+public:
+ DarwinX86_64AsmBackend(const Target &T)
+ : DarwinX86AsmBackend(T) {
+ HasReliableSymbolDifference = true;
+ }
+
+ MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+ return new MachObjectWriter(OS, /*Is64Bit=*/true);
+ }
+
+ virtual bool doesSectionRequireSymbols(const MCSection &Section) const {
+ // Temporary labels in the string literals sections require symbols. The
+ // issue is that the x86_64 relocation format does not allow symbol +
+ // offset, and so the linker does not have enough information to resolve the
+ // access to the appropriate atom unless an external relocation is used. For
+ // non-cstring sections, we expect the compiler to use a non-temporary label
+ // for anything that could have an addend pointing outside the symbol.
+ //
+ // See <rdar://problem/4765733>.
+ const MCSectionMachO &SMO = static_cast<const MCSectionMachO&>(Section);
+ return SMO.getType() == MCSectionMachO::S_CSTRING_LITERALS;
+ }
+
+ virtual bool isSectionAtomizable(const MCSection &Section) const {
+ const MCSectionMachO &SMO = static_cast<const MCSectionMachO&>(Section);
+ // Fixed sized data sections are uniqued, they cannot be diced into atoms.
+ switch (SMO.getType()) {
+ default:
+ return true;
+
+ case MCSectionMachO::S_4BYTE_LITERALS:
+ case MCSectionMachO::S_8BYTE_LITERALS:
+ case MCSectionMachO::S_16BYTE_LITERALS:
+ case MCSectionMachO::S_LITERAL_POINTERS:
+ case MCSectionMachO::S_NON_LAZY_SYMBOL_POINTERS:
+ case MCSectionMachO::S_LAZY_SYMBOL_POINTERS:
+ case MCSectionMachO::S_MOD_INIT_FUNC_POINTERS:
+ case MCSectionMachO::S_MOD_TERM_FUNC_POINTERS:
+ case MCSectionMachO::S_INTERPOSING:
+ return false;
+ }
+ }
+};
+
+}
+
+TargetAsmBackend *llvm::createX86_32AsmBackend(const Target &T,
+ const std::string &TT) {
+ switch (Triple(TT).getOS()) {
+ case Triple::Darwin:
+ return new DarwinX86_32AsmBackend(T);
+ default:
+ return new ELFX86_32AsmBackend(T);
+ }
+}
+
+TargetAsmBackend *llvm::createX86_64AsmBackend(const Target &T,
+ const std::string &TT) {
+ switch (Triple(TT).getOS()) {
+ case Triple::Darwin:
+ return new DarwinX86_64AsmBackend(T);
+ default:
+ return new ELFX86_64AsmBackend(T);
+ }
+}
diff --git a/contrib/llvm/lib/Target/X86/X86COFF.h b/contrib/llvm/lib/Target/X86/X86COFF.h
new file mode 100644
index 0000000..0a8e4e6
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86COFF.h
@@ -0,0 +1,95 @@
+//===--- X86COFF.h - Some definitions from COFF documentation -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file just defines some symbols found in COFF documentation. They are
+// used to emit function type information for COFF targets (Cygwin/Mingw32).
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86COFF_H
+#define X86COFF_H
+
+namespace COFF
+{
+/// Storage class tells where and what the symbol represents
+enum StorageClass {
+ C_EFCN = -1, ///< Physical end of function
+ C_NULL = 0, ///< No symbol
+ C_AUTO = 1, ///< Automatic variable
+ C_EXT = 2, ///< External symbol
+ C_STAT = 3, ///< Static
+ C_REG = 4, ///< Register variable
+ C_EXTDEF = 5, ///< External definition
+ C_LABEL = 6, ///< Label
+ C_ULABEL = 7, ///< Undefined label
+ C_MOS = 8, ///< Member of structure
+ C_ARG = 9, ///< Function argument
+ C_STRTAG = 10, ///< Structure tag
+ C_MOU = 11, ///< Member of union
+ C_UNTAG = 12, ///< Union tag
+ C_TPDEF = 13, ///< Type definition
+ C_USTATIC = 14, ///< Undefined static
+ C_ENTAG = 15, ///< Enumeration tag
+ C_MOE = 16, ///< Member of enumeration
+ C_REGPARM = 17, ///< Register parameter
+ C_FIELD = 18, ///< Bit field
+
+ C_BLOCK = 100, ///< ".bb" or ".eb" - beginning or end of block
+ C_FCN = 101, ///< ".bf" or ".ef" - beginning or end of function
+ C_EOS = 102, ///< End of structure
+ C_FILE = 103, ///< File name
+ C_LINE = 104, ///< Line number, reformatted as symbol
+ C_ALIAS = 105, ///< Duplicate tag
+ C_HIDDEN = 106 ///< External symbol in dmert public lib
+};
+
+/// The type of the symbol. This is made up of a base type and a derived type.
+/// For example, "pointer to int" is the derived type "pointer to T" applied to
+/// the base type "int".
+enum SymbolType {
+ T_NULL = 0, ///< No type info
+ T_ARG = 1, ///< Void function argument (only used by compiler)
+ T_VOID = 1, ///< The same as above. Just named differently in some specs.
+ T_CHAR = 2, ///< Character
+ T_SHORT = 3, ///< Short integer
+ T_INT = 4, ///< Integer
+ T_LONG = 5, ///< Long integer
+ T_FLOAT = 6, ///< Floating point
+ T_DOUBLE = 7, ///< Double precision floating point
+ T_STRUCT = 8, ///< Structure
+ T_UNION = 9, ///< Union
+ T_ENUM = 10, ///< Enumeration
+ T_MOE = 11, ///< Member of enumeration
+ T_UCHAR = 12, ///< Unsigned character
+ T_USHORT = 13, ///< Unsigned short
+ T_UINT = 14, ///< Unsigned integer
+ T_ULONG = 15 ///< Unsigned long
+};
+
+/// Derived type of symbol
+enum SymbolDerivedType {
+ DT_NON = 0, ///< No derived type
+ DT_PTR = 1, ///< Pointer to T
+ DT_FCN = 2, ///< Function returning T
+ DT_ARY = 3 ///< Array of T
+};
+
+/// Masks for extracting parts of type
+enum SymbolTypeMasks {
+ N_BTMASK = 017, ///< Mask for base type
+ N_TMASK = 060 ///< Mask for derived type
+};
+
+/// Offsets of parts of type
+enum Shifts {
+ N_BTSHFT = 4 ///< Type is formed as base + (derived << N_BTSHFT)
+};
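+
+/// For example, the COFF type emitted for a function symbol is
+/// (DT_FCN << N_BTSHFT) | T_NULL, i.e. 0x20, which appears as ".type 32" in a
+/// .def/.endef directive.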
+
+}
+
+#endif // X86COFF_H
diff --git a/contrib/llvm/lib/Target/X86/X86COFFMachineModuleInfo.cpp b/contrib/llvm/lib/Target/X86/X86COFFMachineModuleInfo.cpp
new file mode 100644
index 0000000..4326814
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86COFFMachineModuleInfo.cpp
@@ -0,0 +1,20 @@
+//===-- llvm/CodeGen/X86COFFMachineModuleInfo.cpp -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is an MMI implementation for X86 COFF (windows) targets.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86COFFMachineModuleInfo.h"
+using namespace llvm;
+
+
+X86COFFMachineModuleInfo::~X86COFFMachineModuleInfo() {
+}
+
diff --git a/contrib/llvm/lib/Target/X86/X86COFFMachineModuleInfo.h b/contrib/llvm/lib/Target/X86/X86COFFMachineModuleInfo.h
new file mode 100644
index 0000000..98ab2a6
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86COFFMachineModuleInfo.h
@@ -0,0 +1,46 @@
+//===-- llvm/CodeGen/X86COFFMachineModuleInfo.h -----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is an MMI implementation for X86 COFF (windows) targets.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86COFF_MACHINEMODULEINFO_H
+#define X86COFF_MACHINEMODULEINFO_H
+
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/ADT/DenseSet.h"
+#include "X86MachineFunctionInfo.h"
+
+namespace llvm {
+ class X86MachineFunctionInfo;
+ class TargetData;
+
+/// X86COFFMachineModuleInfo - This is a MachineModuleInfoImpl implementation
+/// for X86 COFF targets.
+class X86COFFMachineModuleInfo : public MachineModuleInfoImpl {
+ DenseSet<MCSymbol const *> Externals;
+public:
+ X86COFFMachineModuleInfo(const MachineModuleInfo &) {}
+ virtual ~X86COFFMachineModuleInfo();
+
+ void addExternalFunction(MCSymbol* Symbol) {
+ Externals.insert(Symbol);
+ }
+
+ typedef DenseSet<MCSymbol const *>::const_iterator externals_iterator;
+ externals_iterator externals_begin() const { return Externals.begin(); }
+ externals_iterator externals_end() const { return Externals.end(); }
+};
+
+
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86CallingConv.td b/contrib/llvm/lib/Target/X86/X86CallingConv.td
new file mode 100644
index 0000000..a5774e1
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86CallingConv.td
@@ -0,0 +1,358 @@
+//===- X86CallingConv.td - Calling Conventions X86 32/64 ---*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the X86-32 and X86-64
+// architectures.
+//
+//===----------------------------------------------------------------------===//
+
+/// CCIfSubtarget - Match if the current subtarget has a feature F.
+class CCIfSubtarget<string F, CCAction A>
+ : CCIf<!strconcat("State.getTarget().getSubtarget<X86Subtarget>().", F), A>;
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Conventions
+//===----------------------------------------------------------------------===//
+
+// Return-value conventions common to all X86 CC's.
+def RetCC_X86Common : CallingConv<[
+ // Scalar values are returned in AX first, then DX. For i8, the ABI
+ // requires the values to be in AL and AH, however this code uses AL and DL
+ // instead. This is because using AH for the second register conflicts with
+ // the way LLVM does multiple return values -- a return of {i16,i8} would end
+ // up in AX and AH, which overlap. Front-ends wishing to conform to the ABI
+ // for functions that return two i8 values are currently expected to pack the
+ // values into an i16 (which uses AX, and thus AL:AH).
+ CCIfType<[i8] , CCAssignToReg<[AL, DL]>>,
+ CCIfType<[i16], CCAssignToReg<[AX, DX]>>,
+ CCIfType<[i32], CCAssignToReg<[EAX, EDX]>>,
+ CCIfType<[i64], CCAssignToReg<[RAX, RDX]>>,
+
+ // Vector types are returned in XMM0 and XMM1, when they fit. XMM2 and XMM3
+ // can only be used by ABI non-compliant code. If the target doesn't have XMM
+ // registers, it won't have vector types.
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>,
+
+ // MMX vector types are always returned in MM0. If the target doesn't have
+ // MM0, it doesn't support these vector types.
+ CCIfType<[v8i8, v4i16, v2i32, v1i64, v2f32], CCAssignToReg<[MM0]>>,
+
+ // Long double types are always returned in ST0 (even with SSE).
+ CCIfType<[f80], CCAssignToReg<[ST0, ST1]>>
+]>;
+
+// X86-32 C return-value convention.
+def RetCC_X86_32_C : CallingConv<[
+ // The X86-32 calling convention returns FP values in ST0, unless marked
+ // with "inreg" (used here to distinguish one kind of reg from another;
+ // this is really the sse-regparm calling convention), in which case they
+ // use XMM0. Otherwise this is the same as the common X86 calling
+ // convention.
+ CCIfInReg<CCIfSubtarget<"hasSSE2()",
+ CCIfType<[f32, f64], CCAssignToReg<[XMM0,XMM1,XMM2]>>>>,
+ CCIfType<[f32,f64], CCAssignToReg<[ST0, ST1]>>,
+ CCDelegateTo<RetCC_X86Common>
+]>;
+
+// X86-32 FastCC return-value convention.
+def RetCC_X86_32_Fast : CallingConv<[
+ // The X86-32 fastcc returns 1, 2, or 3 FP values in XMM0-2 if the target has
+ // SSE2.
+ // This can happen when a float, 2 x float, or 3 x float vector is split by
+ // target lowering, and is returned in 1-3 sse regs.
+ CCIfType<[f32], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>,
+ CCIfType<[f64], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>,
+
+ // For integers, ECX can be used as an extra return register
+ CCIfType<[i8], CCAssignToReg<[AL, DL, CL]>>,
+ CCIfType<[i16], CCAssignToReg<[AX, DX, CX]>>,
+ CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>,
+
+ // Otherwise, it is the same as the common X86 calling convention.
+ CCDelegateTo<RetCC_X86Common>
+]>;
+
+// X86-64 C return-value convention.
+def RetCC_X86_64_C : CallingConv<[
+ // The X86-64 calling convention always returns FP values in XMM0.
+ CCIfType<[f32], CCAssignToReg<[XMM0, XMM1]>>,
+ CCIfType<[f64], CCAssignToReg<[XMM0, XMM1]>>,
+
+ // MMX vector types are always returned in XMM0 except for v1i64 which is
+ // returned in RAX. This disagrees with ABI documentation but is bug
+ // compatible with gcc.
+ CCIfType<[v1i64], CCAssignToReg<[RAX]>>,
+ CCIfType<[v8i8, v4i16, v2i32, v2f32], CCAssignToReg<[XMM0, XMM1]>>,
+ CCDelegateTo<RetCC_X86Common>
+]>;
+
+// X86-Win64 C return-value convention.
+def RetCC_X86_Win64_C : CallingConv<[
+ // The X86-Win64 calling convention always returns __m64 values in RAX.
+ CCIfType<[v8i8, v4i16, v2i32, v1i64], CCBitConvertToType<i64>>,
+
+ // And FP in XMM0 only.
+ CCIfType<[f32], CCAssignToReg<[XMM0]>>,
+ CCIfType<[f64], CCAssignToReg<[XMM0]>>,
+
+ // Otherwise, everything is the same as 'normal' X86-64 C CC.
+ CCDelegateTo<RetCC_X86_64_C>
+]>;
+
+
+// This is the root return-value convention for the X86-32 backend.
+def RetCC_X86_32 : CallingConv<[
+ // If FastCC, use RetCC_X86_32_Fast.
+ CCIfCC<"CallingConv::Fast", CCDelegateTo<RetCC_X86_32_Fast>>,
+ // Otherwise, use RetCC_X86_32_C.
+ CCDelegateTo<RetCC_X86_32_C>
+]>;
+
+// This is the root return-value convention for the X86-64 backend.
+def RetCC_X86_64 : CallingConv<[
+ // Mingw64 and native Win64 use Win64 CC
+ CCIfSubtarget<"isTargetWin64()", CCDelegateTo<RetCC_X86_Win64_C>>,
+
+ // Otherwise, drop to normal X86-64 CC
+ CCDelegateTo<RetCC_X86_64_C>
+]>;
+
+// This is the return-value convention used for the entire X86 backend.
+def RetCC_X86 : CallingConv<[
+ CCIfSubtarget<"is64Bit()", CCDelegateTo<RetCC_X86_64>>,
+ CCDelegateTo<RetCC_X86_32>
+]>;
+
+//===----------------------------------------------------------------------===//
+// X86-64 Argument Calling Conventions
+//===----------------------------------------------------------------------===//
+
+def CC_X86_64_C : CallingConv<[
+ // Handles byval parameters.
+ CCIfByVal<CCPassByVal<8, 8>>,
+
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // The 'nest' parameter, if any, is passed in R10.
+ CCIfNest<CCAssignToReg<[R10]>>,
+
+ // The first 6 v1i64 vector arguments are passed in GPRs on Darwin.
+ CCIfType<[v1i64],
+ CCIfSubtarget<"isTargetDarwin()",
+ CCBitConvertToType<i64>>>,
+
+ // The first 6 integer arguments are passed in integer registers.
+ CCIfType<[i32], CCAssignToReg<[EDI, ESI, EDX, ECX, R8D, R9D]>>,
+ CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX, R8 , R9 ]>>,
+
+ // The first 8 MMX (except for v1i64) vector arguments are passed in XMM
+ // registers on Darwin.
+ CCIfType<[v8i8, v4i16, v2i32, v2f32],
+ CCIfSubtarget<"isTargetDarwin()",
+ CCIfSubtarget<"hasSSE2()",
+ CCPromoteToType<v2i64>>>>,
+
+ // The first 8 FP/Vector arguments are passed in XMM registers.
+ CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCIfSubtarget<"hasSSE1()",
+ CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>>,
+
+ // Integer/FP values get stored in stack slots that are 8 bytes in size and
+ // 8-byte aligned if there are no more registers to hold them.
+ CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
+
+ // Long doubles get stack slots whose size and alignment depend on the
+ // subtarget.
+ CCIfType<[f80], CCAssignToStack<0, 0>>,
+
+ // Vectors get 16-byte stack slots that are 16-byte aligned.
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>,
+
+ // __m64 vectors get 8-byte stack slots that are 8-byte aligned.
+ CCIfType<[v8i8, v4i16, v2i32, v1i64, v2f32], CCAssignToStack<8, 8>>
+]>;
+
+// Calling convention used on Win64
+def CC_X86_Win64_C : CallingConv<[
+ // FIXME: Handle byval stuff.
+ // FIXME: Handle varargs.
+
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // The 'nest' parameter, if any, is passed in R10.
+ CCIfNest<CCAssignToReg<[R10]>>,
+
+ // 128 bit vectors are passed by pointer
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCPassIndirect<i64>>,
+
+ // The first 4 MMX vector arguments are passed in GPRs.
+ CCIfType<[v8i8, v4i16, v2i32, v1i64, v2f32],
+ CCBitConvertToType<i64>>,
+
+ // The first 4 integer arguments are passed in integer registers.
+ CCIfType<[i32], CCAssignToRegWithShadow<[ECX , EDX , R8D , R9D ],
+ [XMM0, XMM1, XMM2, XMM3]>>,
+ CCIfType<[i64], CCAssignToRegWithShadow<[RCX , RDX , R8 , R9 ],
+ [XMM0, XMM1, XMM2, XMM3]>>,
+
+ // The first 4 FP/Vector arguments are passed in XMM registers.
+ CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToRegWithShadow<[XMM0, XMM1, XMM2, XMM3],
+ [RCX , RDX , R8 , R9 ]>>,
+
+ // Integer/FP values get stored in stack slots that are 8 bytes in size and
+ // 8-byte aligned if there are no more registers to hold them.
+ CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
+
+ // Long doubles get stack slots whose size and alignment depend on the
+ // subtarget.
+ CCIfType<[f80], CCAssignToStack<0, 0>>,
+
+ // __m64 vectors get 8-byte stack slots that are 8-byte aligned.
+ CCIfType<[v8i8, v4i16, v2i32, v1i64], CCAssignToStack<8, 8>>
+]>;
+
+def CC_X86_64_GHC : CallingConv<[
+ // Promote i8/i16/i32 arguments to i64.
+ CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+ // Pass in STG registers: Base, Sp, Hp, R1, R2, R3, R4, R5, R6, SpLim
+ CCIfType<[i64],
+ CCAssignToReg<[R13, RBP, R12, RBX, R14, RSI, RDI, R8, R9, R15]>>,
+
+ // Pass in STG registers: F1, F2, F3, F4, D1, D2
+ CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCIfSubtarget<"hasSSE1()",
+ CCAssignToReg<[XMM1, XMM2, XMM3, XMM4, XMM5, XMM6]>>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// X86 C Calling Convention
+//===----------------------------------------------------------------------===//
+
+/// CC_X86_32_Common - In all X86-32 calling conventions, extra integers and FP
+/// values are spilled on the stack, and the first 4 vector values go in XMM
+/// regs.
+def CC_X86_32_Common : CallingConv<[
+ // Handles byval parameters.
+ CCIfByVal<CCPassByVal<4, 4>>,
+
+ // The first 3 float or double arguments, if marked 'inreg' and if the call
+ // is not a vararg call and if SSE2 is available, are passed in SSE registers.
+ CCIfNotVarArg<CCIfInReg<CCIfType<[f32,f64],
+ CCIfSubtarget<"hasSSE2()",
+ CCAssignToReg<[XMM0,XMM1,XMM2]>>>>>,
+
+ // The first 3 __m64 (except for v1i64) vector arguments are passed in mmx
+ // registers if the call is not a vararg call.
+ CCIfNotVarArg<CCIfType<[v8i8, v4i16, v2i32, v2f32],
+ CCAssignToReg<[MM0, MM1, MM2]>>>,
+
+ // Integer/Float values get stored in stack slots that are 4 bytes in
+ // size and 4-byte aligned.
+ CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+
+ // Doubles get 8-byte slots that are 4-byte aligned.
+ CCIfType<[f64], CCAssignToStack<8, 4>>,
+
+ // Long doubles get slots whose size depends on the subtarget.
+ CCIfType<[f80], CCAssignToStack<0, 4>>,
+
+ // The first 4 SSE vector arguments are passed in XMM registers.
+ CCIfNotVarArg<CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>>,
+
+ // Other SSE vectors get 16-byte stack slots that are 16-byte aligned.
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>,
+
+ // __m64 vectors get 8-byte stack slots that are 4-byte aligned. They are
+ // passed in the parameter area.
+ CCIfType<[v8i8, v4i16, v2i32, v1i64], CCAssignToStack<8, 4>>]>;
+
+def CC_X86_32_C : CallingConv<[
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // The 'nest' parameter, if any, is passed in ECX.
+ CCIfNest<CCAssignToReg<[ECX]>>,
+
+ // The first 3 integer arguments, if marked 'inreg' and if the call is not
+ // a vararg call, are passed in integer registers.
+ CCIfNotVarArg<CCIfInReg<CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>>>,
+
+ // Otherwise, same as everything else.
+ CCDelegateTo<CC_X86_32_Common>
+]>;
+
+def CC_X86_32_FastCall : CallingConv<[
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // The 'nest' parameter, if any, is passed in EAX.
+ CCIfNest<CCAssignToReg<[EAX]>>,
+
+ // The first 2 integer arguments are passed in ECX/EDX
+ CCIfType<[i32], CCAssignToReg<[ECX, EDX]>>,
+
+ // Otherwise, same as everything else.
+ CCDelegateTo<CC_X86_32_Common>
+]>;
+
+def CC_X86_32_ThisCall : CallingConv<[
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // The 'nest' parameter, if any, is passed in EAX.
+ CCIfNest<CCAssignToReg<[EAX]>>,
+
+ // The first integer argument is passed in ECX
+ CCIfType<[i32], CCAssignToReg<[ECX]>>,
+
+ // Otherwise, same as everything else.
+ CCDelegateTo<CC_X86_32_Common>
+]>;
+
+def CC_X86_32_FastCC : CallingConv<[
+ // Handles byval parameters. Note that we can't rely on the delegation
+ // to CC_X86_32_Common for this because that happens after code that
+ // puts arguments in registers.
+ CCIfByVal<CCPassByVal<4, 4>>,
+
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // The 'nest' parameter, if any, is passed in EAX.
+ CCIfNest<CCAssignToReg<[EAX]>>,
+
+ // The first 2 integer arguments are passed in ECX/EDX
+ CCIfType<[i32], CCAssignToReg<[ECX, EDX]>>,
+
+ // The first 3 float or double arguments, if the call is not a vararg
+ // call and if SSE2 is available, are passed in SSE registers.
+ CCIfNotVarArg<CCIfType<[f32,f64],
+ CCIfSubtarget<"hasSSE2()",
+ CCAssignToReg<[XMM0,XMM1,XMM2]>>>>,
+
+ // Doubles get 8-byte slots that are 8-byte aligned.
+ CCIfType<[f64], CCAssignToStack<8, 8>>,
+
+ // Otherwise, same as everything else.
+ CCDelegateTo<CC_X86_32_Common>
+]>;
+
+def CC_X86_32_GHC : CallingConv<[
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // Pass in STG registers: Base, Sp, Hp, R1
+ CCIfType<[i32], CCAssignToReg<[EBX, EBP, EDI, ESI]>>
+]>;
diff --git a/contrib/llvm/lib/Target/X86/X86CodeEmitter.cpp b/contrib/llvm/lib/Target/X86/X86CodeEmitter.cpp
new file mode 100644
index 0000000..8f02604
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86CodeEmitter.cpp
@@ -0,0 +1,887 @@
+//===-- X86/X86CodeEmitter.cpp - Convert X86 code to machine code ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the pass that transforms the X86 machine instructions into
+// relocatable machine code.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-emitter"
+#include "X86InstrInfo.h"
+#include "X86JITInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "X86Relocations.h"
+#include "X86.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/PassManager.h"
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Function.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+STATISTIC(NumEmitted, "Number of machine instructions emitted");
+
+namespace {
+ template<class CodeEmitter>
+ class Emitter : public MachineFunctionPass {
+ const X86InstrInfo *II;
+ const TargetData *TD;
+ X86TargetMachine &TM;
+ CodeEmitter &MCE;
+ MachineModuleInfo *MMI;
+ intptr_t PICBaseOffset;
+ bool Is64BitMode;
+ bool IsPIC;
+ public:
+ static char ID;
+ explicit Emitter(X86TargetMachine &tm, CodeEmitter &mce)
+ : MachineFunctionPass(&ID), II(0), TD(0), TM(tm),
+ MCE(mce), PICBaseOffset(0), Is64BitMode(false),
+ IsPIC(TM.getRelocationModel() == Reloc::PIC_) {}
+ Emitter(X86TargetMachine &tm, CodeEmitter &mce,
+ const X86InstrInfo &ii, const TargetData &td, bool is64)
+ : MachineFunctionPass(&ID), II(&ii), TD(&td), TM(tm),
+ MCE(mce), PICBaseOffset(0), Is64BitMode(is64),
+ IsPIC(TM.getRelocationModel() == Reloc::PIC_) {}
+
+ bool runOnMachineFunction(MachineFunction &MF);
+
+ virtual const char *getPassName() const {
+ return "X86 Machine Code Emitter";
+ }
+
+ void emitInstruction(const MachineInstr &MI,
+ const TargetInstrDesc *Desc);
+
+ void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequired<MachineModuleInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ private:
+ void emitPCRelativeBlockAddress(MachineBasicBlock *MBB);
+ void emitGlobalAddress(const GlobalValue *GV, unsigned Reloc,
+ intptr_t Disp = 0, intptr_t PCAdj = 0,
+ bool Indirect = false);
+ void emitExternalSymbolAddress(const char *ES, unsigned Reloc);
+ void emitConstPoolAddress(unsigned CPI, unsigned Reloc, intptr_t Disp = 0,
+ intptr_t PCAdj = 0);
+ void emitJumpTableAddress(unsigned JTI, unsigned Reloc,
+ intptr_t PCAdj = 0);
+
+ void emitDisplacementField(const MachineOperand *RelocOp, int DispVal,
+ intptr_t Adj = 0, bool IsPCRel = true);
+
+ void emitRegModRMByte(unsigned ModRMReg, unsigned RegOpcodeField);
+ void emitRegModRMByte(unsigned RegOpcodeField);
+ void emitSIBByte(unsigned SS, unsigned Index, unsigned Base);
+ void emitConstant(uint64_t Val, unsigned Size);
+
+ void emitMemModRMByte(const MachineInstr &MI,
+ unsigned Op, unsigned RegOpcodeField,
+ intptr_t PCAdj = 0);
+
+ unsigned getX86RegNum(unsigned RegNo) const;
+ };
+
+template<class CodeEmitter>
+ char Emitter<CodeEmitter>::ID = 0;
+} // end anonymous namespace.
+
+/// createX86JITCodeEmitterPass - Return a pass that emits the collected X86
+/// code to the specified JITCodeEmitter object.
+FunctionPass *llvm::createX86JITCodeEmitterPass(X86TargetMachine &TM,
+ JITCodeEmitter &JCE) {
+ return new Emitter<JITCodeEmitter>(TM, JCE);
+}
+
+template<class CodeEmitter>
+bool Emitter<CodeEmitter>::runOnMachineFunction(MachineFunction &MF) {
+ MMI = &getAnalysis<MachineModuleInfo>();
+ MCE.setModuleInfo(MMI);
+
+ II = TM.getInstrInfo();
+ TD = TM.getTargetData();
+ Is64BitMode = TM.getSubtarget<X86Subtarget>().is64Bit();
+ IsPIC = TM.getRelocationModel() == Reloc::PIC_;
+
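+  // Emission may need to be restarted if the code buffer turns out to be too
+  // small; finishFunction() returns true in that case and the loop retries.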
+ do {
+ DEBUG(dbgs() << "JITTing function '"
+ << MF.getFunction()->getName() << "'\n");
+ MCE.startFunction(MF);
+ for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
+ MBB != E; ++MBB) {
+ MCE.StartMachineBasicBlock(MBB);
+ for (MachineBasicBlock::const_iterator I = MBB->begin(), E = MBB->end();
+ I != E; ++I) {
+ const TargetInstrDesc &Desc = I->getDesc();
+ emitInstruction(*I, &Desc);
+ // MOVPC32r is basically a call plus a pop instruction.
+ if (Desc.getOpcode() == X86::MOVPC32r)
+ emitInstruction(*I, &II->get(X86::POP32r));
+ NumEmitted++; // Keep track of the # of mi's emitted
+ }
+ }
+ } while (MCE.finishFunction(MF));
+
+ return false;
+}
+
+/// emitPCRelativeBlockAddress - This method keeps track of the information
+/// necessary to resolve the address of this block later and emits a dummy
+/// value.
+///
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitPCRelativeBlockAddress(MachineBasicBlock *MBB) {
+ // Remember where this reference was and where it is to so we can
+ // deal with it later.
+ MCE.addRelocation(MachineRelocation::getBB(MCE.getCurrentPCOffset(),
+ X86::reloc_pcrel_word, MBB));
+ MCE.emitWordLE(0);
+}
+
+/// emitGlobalAddress - Emit the specified address to the code stream assuming
+/// this is part of a "take the address of a global" instruction.
+///
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitGlobalAddress(const GlobalValue *GV,
+ unsigned Reloc,
+ intptr_t Disp /* = 0 */,
+ intptr_t PCAdj /* = 0 */,
+ bool Indirect /* = false */) {
+ intptr_t RelocCST = Disp;
+ if (Reloc == X86::reloc_picrel_word)
+ RelocCST = PICBaseOffset;
+ else if (Reloc == X86::reloc_pcrel_word)
+ RelocCST = PCAdj;
+ MachineRelocation MR = Indirect
+ ? MachineRelocation::getIndirectSymbol(MCE.getCurrentPCOffset(), Reloc,
+ const_cast<GlobalValue *>(GV),
+ RelocCST, false)
+ : MachineRelocation::getGV(MCE.getCurrentPCOffset(), Reloc,
+ const_cast<GlobalValue *>(GV), RelocCST, false);
+ MCE.addRelocation(MR);
+ // The relocated value will be added to the displacement
+ if (Reloc == X86::reloc_absolute_dword)
+ MCE.emitDWordLE(Disp);
+ else
+ MCE.emitWordLE((int32_t)Disp);
+}
+
+/// emitExternalSymbolAddress - Arrange for the address of an external symbol to
+/// be emitted to the current location in the function, and allow it to be PC
+/// relative.
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitExternalSymbolAddress(const char *ES,
+ unsigned Reloc) {
+ intptr_t RelocCST = (Reloc == X86::reloc_picrel_word) ? PICBaseOffset : 0;
+
+ // X86 never needs stubs because instruction selection will always pick
+ // an instruction sequence that is large enough to hold any address
+ // to a symbol.
+ // (see X86ISelLowering.cpp, near 2039: X86TargetLowering::LowerCall)
+ bool NeedStub = false;
+ MCE.addRelocation(MachineRelocation::getExtSym(MCE.getCurrentPCOffset(),
+ Reloc, ES, RelocCST,
+ 0, NeedStub));
+ if (Reloc == X86::reloc_absolute_dword)
+ MCE.emitDWordLE(0);
+ else
+ MCE.emitWordLE(0);
+}
+
+/// emitConstPoolAddress - Arrange for the address of a constant pool entry
+/// to be emitted to the current location in the function, and allow it to be PC
+/// relative.
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitConstPoolAddress(unsigned CPI, unsigned Reloc,
+ intptr_t Disp /* = 0 */,
+ intptr_t PCAdj /* = 0 */) {
+ intptr_t RelocCST = 0;
+ if (Reloc == X86::reloc_picrel_word)
+ RelocCST = PICBaseOffset;
+ else if (Reloc == X86::reloc_pcrel_word)
+ RelocCST = PCAdj;
+ MCE.addRelocation(MachineRelocation::getConstPool(MCE.getCurrentPCOffset(),
+ Reloc, CPI, RelocCST));
+ // The relocated value will be added to the displacement
+ if (Reloc == X86::reloc_absolute_dword)
+ MCE.emitDWordLE(Disp);
+ else
+ MCE.emitWordLE((int32_t)Disp);
+}
+
+/// emitJumpTableAddress - Arrange for the address of a jump table to
+/// be emitted to the current location in the function, and allow it to be PC
+/// relative.
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitJumpTableAddress(unsigned JTI, unsigned Reloc,
+ intptr_t PCAdj /* = 0 */) {
+ intptr_t RelocCST = 0;
+ if (Reloc == X86::reloc_picrel_word)
+ RelocCST = PICBaseOffset;
+ else if (Reloc == X86::reloc_pcrel_word)
+ RelocCST = PCAdj;
+ MCE.addRelocation(MachineRelocation::getJumpTable(MCE.getCurrentPCOffset(),
+ Reloc, JTI, RelocCST));
+ // The relocated value will be added to the displacement
+ if (Reloc == X86::reloc_absolute_dword)
+ MCE.emitDWordLE(0);
+ else
+ MCE.emitWordLE(0);
+}
+
+template<class CodeEmitter>
+unsigned Emitter<CodeEmitter>::getX86RegNum(unsigned RegNo) const {
+ return X86RegisterInfo::getX86RegNum(RegNo);
+}
+
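+// A ModR/M byte packs three fields: mod in bits 7-6, reg/opcode in bits 5-3,
+// and r/m in bits 2-0. For example, ModRMByte(3, 2, 0) yields 0xD0.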
+inline static unsigned char ModRMByte(unsigned Mod, unsigned RegOpcode,
+ unsigned RM) {
+ assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!");
+ return RM | (RegOpcode << 3) | (Mod << 6);
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitRegModRMByte(unsigned ModRMReg,
+ unsigned RegOpcodeFld){
+ MCE.emitByte(ModRMByte(3, RegOpcodeFld, getX86RegNum(ModRMReg)));
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitRegModRMByte(unsigned RegOpcodeFld) {
+ MCE.emitByte(ModRMByte(3, RegOpcodeFld, 0));
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitSIBByte(unsigned SS,
+ unsigned Index,
+ unsigned Base) {
+ // SIB byte is in the same format as the ModRMByte...
+ MCE.emitByte(ModRMByte(SS, Index, Base));
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitConstant(uint64_t Val, unsigned Size) {
+ // Output the constant in little endian byte order...
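+  // For example, emitConstant(0x12345678, 4) emits the bytes 78 56 34 12.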
+ for (unsigned i = 0; i != Size; ++i) {
+ MCE.emitByte(Val & 255);
+ Val >>= 8;
+ }
+}
+
+/// isDisp8 - Return true if this signed displacement fits in an 8-bit
+/// sign-extended field.
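+/// For example, displacements in the range [-128, 127] can use the disp8 form.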
+static bool isDisp8(int Value) {
+ return Value == (signed char)Value;
+}
+
+static bool gvNeedsNonLazyPtr(const MachineOperand &GVOp,
+ const TargetMachine &TM) {
+ // For Darwin-64, simulate the linktime GOT by using the same non-lazy-pointer
+ // mechanism as 32-bit mode.
+ if (TM.getSubtarget<X86Subtarget>().is64Bit() &&
+ !TM.getSubtarget<X86Subtarget>().isTargetDarwin())
+ return false;
+
+ // Return true if this is a reference to a stub containing the address of the
+ // global, not the global itself.
+ return isGlobalStubReference(GVOp.getTargetFlags());
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitDisplacementField(const MachineOperand *RelocOp,
+ int DispVal,
+ intptr_t Adj /* = 0 */,
+ bool IsPCRel /* = true */) {
+ // If this is a simple integer displacement that doesn't require a relocation,
+ // emit it now.
+ if (!RelocOp) {
+ emitConstant(DispVal, 4);
+ return;
+ }
+
+ // Otherwise, this is something that requires a relocation. Emit it as such
+ // now.
+ unsigned RelocType = Is64BitMode ?
+ (IsPCRel ? X86::reloc_pcrel_word : X86::reloc_absolute_word_sext)
+ : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
+ if (RelocOp->isGlobal()) {
+ // In 64-bit static small code model, we could potentially emit absolute.
+ // But it's probably not beneficial. If the MCE supports using RIP directly
+ // do it, otherwise fallback to absolute (this is determined by IsPCRel).
+ // 89 05 00 00 00 00 mov %eax,0(%rip) # PC-relative
+ // 89 04 25 00 00 00 00 mov %eax,0x0 # Absolute
+ bool Indirect = gvNeedsNonLazyPtr(*RelocOp, TM);
+ emitGlobalAddress(RelocOp->getGlobal(), RelocType, RelocOp->getOffset(),
+ Adj, Indirect);
+ } else if (RelocOp->isSymbol()) {
+ emitExternalSymbolAddress(RelocOp->getSymbolName(), RelocType);
+ } else if (RelocOp->isCPI()) {
+ emitConstPoolAddress(RelocOp->getIndex(), RelocType,
+ RelocOp->getOffset(), Adj);
+ } else {
+ assert(RelocOp->isJTI() && "Unexpected machine operand!");
+ emitJumpTableAddress(RelocOp->getIndex(), RelocType, Adj);
+ }
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI,
+ unsigned Op,unsigned RegOpcodeField,
+ intptr_t PCAdj) {
+ const MachineOperand &Op3 = MI.getOperand(Op+3);
+ int DispVal = 0;
+ const MachineOperand *DispForReloc = 0;
+
+ // Figure out what sort of displacement we have to handle here.
+ if (Op3.isGlobal()) {
+ DispForReloc = &Op3;
+ } else if (Op3.isSymbol()) {
+ DispForReloc = &Op3;
+ } else if (Op3.isCPI()) {
+ if (!MCE.earlyResolveAddresses() || Is64BitMode || IsPIC) {
+ DispForReloc = &Op3;
+ } else {
+ DispVal += MCE.getConstantPoolEntryAddress(Op3.getIndex());
+ DispVal += Op3.getOffset();
+ }
+ } else if (Op3.isJTI()) {
+ if (!MCE.earlyResolveAddresses() || Is64BitMode || IsPIC) {
+ DispForReloc = &Op3;
+ } else {
+ DispVal += MCE.getJumpTableEntryAddress(Op3.getIndex());
+ }
+ } else {
+ DispVal = Op3.getImm();
+ }
+
+ const MachineOperand &Base = MI.getOperand(Op);
+ const MachineOperand &Scale = MI.getOperand(Op+1);
+ const MachineOperand &IndexReg = MI.getOperand(Op+2);
+
+ unsigned BaseReg = Base.getReg();
+
+ // Handle %rip relative addressing.
+ if (BaseReg == X86::RIP ||
+ (Is64BitMode && DispForReloc)) { // [disp32+RIP] in X86-64 mode
+ assert(IndexReg.getReg() == 0 && Is64BitMode &&
+ "Invalid rip-relative address");
+ MCE.emitByte(ModRMByte(0, RegOpcodeField, 5));
+ emitDisplacementField(DispForReloc, DispVal, PCAdj, true);
+ return;
+ }
+
+  // Indicate that the displacement will use a pcrel or absolute reference
+  // by default. MCEs able to resolve addresses on-the-fly use pcrel by default,
+  // while others, unless explicitly asked to use RIP, use absolute references.
+  bool IsPCRel = MCE.earlyResolveAddresses();
+
+ // Is a SIB byte needed?
+ // If no BaseReg, issue a RIP relative instruction only if the MCE can
+ // resolve addresses on-the-fly, otherwise use SIB (Intel Manual 2A, table
+ // 2-7) and absolute references.
+ unsigned BaseRegNo = -1U;
+ if (BaseReg != 0 && BaseReg != X86::RIP)
+ BaseRegNo = getX86RegNum(BaseReg);
+
+ if (// The SIB byte must be used if there is an index register.
+ IndexReg.getReg() == 0 &&
+ // The SIB byte must be used if the base is ESP/RSP/R12, all of which
+ // encode to an R/M value of 4, which indicates that a SIB byte is
+ // present.
+ BaseRegNo != N86::ESP &&
+ // If there is no base register and we're in 64-bit mode, we need a SIB
+ // byte to emit an addr that is just 'disp32' (the non-RIP relative form).
+ (!Is64BitMode || BaseReg != 0)) {
+ if (BaseReg == 0 || // [disp32] in X86-32 mode
+ BaseReg == X86::RIP) { // [disp32+RIP] in X86-64 mode
+ MCE.emitByte(ModRMByte(0, RegOpcodeField, 5));
+ emitDisplacementField(DispForReloc, DispVal, PCAdj, true);
+ return;
+ }
+
+ // If the base is not EBP/ESP and there is no displacement, use simple
+ // indirect register encoding, this handles addresses like [EAX]. The
+ // encoding for [EBP] with no displacement means [disp32] so we handle it
+ // by emitting a displacement of 0 below.
+ if (!DispForReloc && DispVal == 0 && BaseRegNo != N86::EBP) {
+ MCE.emitByte(ModRMByte(0, RegOpcodeField, BaseRegNo));
+ return;
+ }
+
+ // Otherwise, if the displacement fits in a byte, encode as [REG+disp8].
+ if (!DispForReloc && isDisp8(DispVal)) {
+ MCE.emitByte(ModRMByte(1, RegOpcodeField, BaseRegNo));
+ emitConstant(DispVal, 1);
+ return;
+ }
+
+ // Otherwise, emit the most general non-SIB encoding: [REG+disp32]
+ MCE.emitByte(ModRMByte(2, RegOpcodeField, BaseRegNo));
+ emitDisplacementField(DispForReloc, DispVal, PCAdj, IsPCRel);
+ return;
+ }
+
+ // Otherwise we need a SIB byte, so start by outputting the ModR/M byte first.
+ assert(IndexReg.getReg() != X86::ESP &&
+ IndexReg.getReg() != X86::RSP && "Cannot use ESP as index reg!");
+
+ bool ForceDisp32 = false;
+ bool ForceDisp8 = false;
+ if (BaseReg == 0) {
+ // If there is no base register, we emit the special case SIB byte with
+ // MOD=0, BASE=4, to JUST get the index, scale, and displacement.
+ MCE.emitByte(ModRMByte(0, RegOpcodeField, 4));
+ ForceDisp32 = true;
+ } else if (DispForReloc) {
+ // Emit the normal disp32 encoding.
+ MCE.emitByte(ModRMByte(2, RegOpcodeField, 4));
+ ForceDisp32 = true;
+ } else if (DispVal == 0 && BaseRegNo != N86::EBP) {
+ // Emit no displacement ModR/M byte
+ MCE.emitByte(ModRMByte(0, RegOpcodeField, 4));
+ } else if (isDisp8(DispVal)) {
+ // Emit the disp8 encoding...
+ MCE.emitByte(ModRMByte(1, RegOpcodeField, 4));
+ ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP
+ } else {
+ // Emit the normal disp32 encoding...
+ MCE.emitByte(ModRMByte(2, RegOpcodeField, 4));
+ }
+
+ // Calculate what the SS field value should be...
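+  // The SIB scale field holds log2 of the scale factor: 1 -> 0, 2 -> 1,
+  // 4 -> 2, 8 -> 3; any other scale is invalid and maps to ~0 below.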
+ static const unsigned SSTable[] = { ~0, 0, 1, ~0, 2, ~0, ~0, ~0, 3 };
+ unsigned SS = SSTable[Scale.getImm()];
+
+ if (BaseReg == 0) {
+ // Handle the SIB byte for the case where there is no base, see Intel
+ // Manual 2A, table 2-7. The displacement has already been output.
+ unsigned IndexRegNo;
+ if (IndexReg.getReg())
+ IndexRegNo = getX86RegNum(IndexReg.getReg());
+ else // Examples: [ESP+1*<noreg>+4] or [scaled idx]+disp32 (MOD=0,BASE=5)
+ IndexRegNo = 4;
+ emitSIBByte(SS, IndexRegNo, 5);
+ } else {
+ unsigned BaseRegNo = getX86RegNum(BaseReg);
+ unsigned IndexRegNo;
+ if (IndexReg.getReg())
+ IndexRegNo = getX86RegNum(IndexReg.getReg());
+ else
+ IndexRegNo = 4; // For example [ESP+1*<noreg>+4]
+ emitSIBByte(SS, IndexRegNo, BaseRegNo);
+ }
+
+ // Do we need to output a displacement?
+ if (ForceDisp8) {
+ emitConstant(DispVal, 1);
+ } else if (DispVal != 0 || ForceDisp32) {
+ emitDisplacementField(DispForReloc, DispVal, PCAdj, IsPCRel);
+ }
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitInstruction(const MachineInstr &MI,
+ const TargetInstrDesc *Desc) {
+ DEBUG(dbgs() << MI);
+
+ MCE.processDebugLoc(MI.getDebugLoc(), true);
+
+ unsigned Opcode = Desc->Opcode;
+
+ // Emit the lock opcode prefix as needed.
+ if (Desc->TSFlags & X86II::LOCK)
+ MCE.emitByte(0xF0);
+
+ // Emit segment override opcode prefix as needed.
+ switch (Desc->TSFlags & X86II::SegOvrMask) {
+ case X86II::FS:
+ MCE.emitByte(0x64);
+ break;
+ case X86II::GS:
+ MCE.emitByte(0x65);
+ break;
+ default: llvm_unreachable("Invalid segment!");
+ case 0: break; // No segment override!
+ }
+
+ // Emit the repeat opcode prefix as needed.
+ if ((Desc->TSFlags & X86II::Op0Mask) == X86II::REP)
+ MCE.emitByte(0xF3);
+
+ // Emit the operand size opcode prefix as needed.
+ if (Desc->TSFlags & X86II::OpSize)
+ MCE.emitByte(0x66);
+
+ // Emit the address size opcode prefix as needed.
+ if (Desc->TSFlags & X86II::AdSize)
+ MCE.emitByte(0x67);
+
+ bool Need0FPrefix = false;
+ switch (Desc->TSFlags & X86II::Op0Mask) {
+ case X86II::TB: // Two-byte opcode prefix
+ case X86II::T8: // 0F 38
+ case X86II::TA: // 0F 3A
+ Need0FPrefix = true;
+ break;
+ case X86II::TF: // F2 0F 38
+ MCE.emitByte(0xF2);
+ Need0FPrefix = true;
+ break;
+ case X86II::REP: break; // already handled.
+ case X86II::XS: // F3 0F
+ MCE.emitByte(0xF3);
+ Need0FPrefix = true;
+ break;
+ case X86II::XD: // F2 0F
+ MCE.emitByte(0xF2);
+ Need0FPrefix = true;
+ break;
+ case X86II::D8: case X86II::D9: case X86II::DA: case X86II::DB:
+ case X86II::DC: case X86II::DD: case X86II::DE: case X86II::DF:
+ MCE.emitByte(0xD8+
+ (((Desc->TSFlags & X86II::Op0Mask)-X86II::D8)
+ >> X86II::Op0Shift));
+ break; // Two-byte opcode prefix
+ default: llvm_unreachable("Invalid prefix!");
+ case 0: break; // No prefix!
+ }
+
+ // Handle REX prefix.
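+  // In 64-bit mode the optional REX prefix is 0x40 plus the W/R/X/B bits
+  // derived from the operands; it is omitted when all of those bits are zero.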
+ if (Is64BitMode) {
+ if (unsigned REX = X86InstrInfo::determineREX(MI))
+ MCE.emitByte(0x40 | REX);
+ }
+
+ // 0x0F escape code must be emitted just before the opcode.
+ if (Need0FPrefix)
+ MCE.emitByte(0x0F);
+
+ switch (Desc->TSFlags & X86II::Op0Mask) {
+ case X86II::TF: // F2 0F 38
+ case X86II::T8: // 0F 38
+ MCE.emitByte(0x38);
+ break;
+ case X86II::TA: // 0F 3A
+ MCE.emitByte(0x3A);
+ break;
+ }
+
+ // If this is a two-address instruction, skip one of the register operands.
+ unsigned NumOps = Desc->getNumOperands();
+ unsigned CurOp = 0;
+ if (NumOps > 1 && Desc->getOperandConstraint(1, TOI::TIED_TO) != -1)
+ ++CurOp;
+ else if (NumOps > 2 && Desc->getOperandConstraint(NumOps-1, TOI::TIED_TO)== 0)
+ // Skip the last source operand that is tied_to the dest reg. e.g. LXADD32
+ --NumOps;
+
+ unsigned char BaseOpcode = X86II::getBaseOpcodeFor(Desc->TSFlags);
+ switch (Desc->TSFlags & X86II::FormMask) {
+ default:
+ llvm_unreachable("Unknown FormMask value in X86 MachineCodeEmitter!");
+ case X86II::Pseudo:
+ // Remember the current PC offset, this is the PIC relocation
+ // base address.
+ switch (Opcode) {
+ default:
+      llvm_unreachable("pseudo instructions should be removed before code"
+ " emission");
+ break;
+ case TargetOpcode::INLINEASM:
+ // We allow inline assembler nodes with empty bodies - they can
+ // implicitly define registers, which is ok for JIT.
+ if (MI.getOperand(0).getSymbolName()[0])
+ report_fatal_error("JIT does not support inline asm!");
+ break;
+ case TargetOpcode::DBG_LABEL:
+ case TargetOpcode::GC_LABEL:
+ case TargetOpcode::EH_LABEL:
+ MCE.emitLabel(MI.getOperand(0).getMCSymbol());
+ break;
+
+ case TargetOpcode::IMPLICIT_DEF:
+ case TargetOpcode::KILL:
+ case X86::FP_REG_KILL:
+ break;
+ case X86::MOVPC32r: {
+ // This emits the "call" portion of this pseudo instruction.
+ MCE.emitByte(BaseOpcode);
+ emitConstant(0, X86II::getSizeOfImm(Desc->TSFlags));
+ // Remember PIC base.
+ PICBaseOffset = (intptr_t) MCE.getCurrentPCOffset();
+ X86JITInfo *JTI = TM.getJITInfo();
+ JTI->setPICBase(MCE.getCurrentPCValue());
+ break;
+ }
+ }
+ CurOp = NumOps;
+ break;
+ case X86II::RawFrm: {
+ MCE.emitByte(BaseOpcode);
+
+ if (CurOp == NumOps)
+ break;
+
+ const MachineOperand &MO = MI.getOperand(CurOp++);
+
+ DEBUG(dbgs() << "RawFrm CurOp " << CurOp << "\n");
+ DEBUG(dbgs() << "isMBB " << MO.isMBB() << "\n");
+ DEBUG(dbgs() << "isGlobal " << MO.isGlobal() << "\n");
+ DEBUG(dbgs() << "isSymbol " << MO.isSymbol() << "\n");
+ DEBUG(dbgs() << "isImm " << MO.isImm() << "\n");
+
+ if (MO.isMBB()) {
+ emitPCRelativeBlockAddress(MO.getMBB());
+ break;
+ }
+
+ if (MO.isGlobal()) {
+ emitGlobalAddress(MO.getGlobal(), X86::reloc_pcrel_word,
+ MO.getOffset(), 0);
+ break;
+ }
+
+ if (MO.isSymbol()) {
+ emitExternalSymbolAddress(MO.getSymbolName(), X86::reloc_pcrel_word);
+ break;
+ }
+
+ // FIXME: Only used by hackish MCCodeEmitter, remove when dead.
+ if (MO.isJTI()) {
+ emitJumpTableAddress(MO.getIndex(), X86::reloc_pcrel_word);
+ break;
+ }
+
+ assert(MO.isImm() && "Unknown RawFrm operand!");
+ if (Opcode == X86::CALLpcrel32 || Opcode == X86::CALL64pcrel32) {
+ // Fix up immediate operand for pc relative calls.
+ intptr_t Imm = (intptr_t)MO.getImm();
+ Imm = Imm - MCE.getCurrentPCValue() - 4;
+ emitConstant(Imm, X86II::getSizeOfImm(Desc->TSFlags));
+ } else
+ emitConstant(MO.getImm(), X86II::getSizeOfImm(Desc->TSFlags));
+ break;
+ }
+
+ case X86II::AddRegFrm: {
+ MCE.emitByte(BaseOpcode + getX86RegNum(MI.getOperand(CurOp++).getReg()));
+
+ if (CurOp == NumOps)
+ break;
+
+ const MachineOperand &MO1 = MI.getOperand(CurOp++);
+ unsigned Size = X86II::getSizeOfImm(Desc->TSFlags);
+ if (MO1.isImm()) {
+ emitConstant(MO1.getImm(), Size);
+ break;
+ }
+
+ unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
+ : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
+ if (Opcode == X86::MOV64ri64i32)
+ rt = X86::reloc_absolute_word; // FIXME: add X86II flag?
+ // This should not occur on Darwin for relocatable objects.
+ if (Opcode == X86::MOV64ri)
+ rt = X86::reloc_absolute_dword; // FIXME: add X86II flag?
+ if (MO1.isGlobal()) {
+ bool Indirect = gvNeedsNonLazyPtr(MO1, TM);
+ emitGlobalAddress(MO1.getGlobal(), rt, MO1.getOffset(), 0,
+ Indirect);
+ } else if (MO1.isSymbol())
+ emitExternalSymbolAddress(MO1.getSymbolName(), rt);
+ else if (MO1.isCPI())
+ emitConstPoolAddress(MO1.getIndex(), rt);
+ else if (MO1.isJTI())
+ emitJumpTableAddress(MO1.getIndex(), rt);
+ break;
+ }
+
+ case X86II::MRMDestReg: {
+ MCE.emitByte(BaseOpcode);
+ emitRegModRMByte(MI.getOperand(CurOp).getReg(),
+ getX86RegNum(MI.getOperand(CurOp+1).getReg()));
+ CurOp += 2;
+ if (CurOp != NumOps)
+ emitConstant(MI.getOperand(CurOp++).getImm(),
+ X86II::getSizeOfImm(Desc->TSFlags));
+ break;
+ }
+ case X86II::MRMDestMem: {
+ MCE.emitByte(BaseOpcode);
+ emitMemModRMByte(MI, CurOp,
+ getX86RegNum(MI.getOperand(CurOp + X86AddrNumOperands)
+ .getReg()));
+ CurOp += X86AddrNumOperands + 1;
+ if (CurOp != NumOps)
+ emitConstant(MI.getOperand(CurOp++).getImm(),
+ X86II::getSizeOfImm(Desc->TSFlags));
+ break;
+ }
+
+ case X86II::MRMSrcReg:
+ MCE.emitByte(BaseOpcode);
+ emitRegModRMByte(MI.getOperand(CurOp+1).getReg(),
+ getX86RegNum(MI.getOperand(CurOp).getReg()));
+ CurOp += 2;
+ if (CurOp != NumOps)
+ emitConstant(MI.getOperand(CurOp++).getImm(),
+ X86II::getSizeOfImm(Desc->TSFlags));
+ break;
+
+ case X86II::MRMSrcMem: {
+ // FIXME: Maybe lea should have its own form?
+ int AddrOperands;
+ if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r ||
+ Opcode == X86::LEA16r || Opcode == X86::LEA32r)
+ AddrOperands = X86AddrNumOperands - 1; // No segment register
+ else
+ AddrOperands = X86AddrNumOperands;
+
+ intptr_t PCAdj = (CurOp + AddrOperands + 1 != NumOps) ?
+ X86II::getSizeOfImm(Desc->TSFlags) : 0;
+
+ MCE.emitByte(BaseOpcode);
+ emitMemModRMByte(MI, CurOp+1, getX86RegNum(MI.getOperand(CurOp).getReg()),
+ PCAdj);
+ CurOp += AddrOperands + 1;
+ if (CurOp != NumOps)
+ emitConstant(MI.getOperand(CurOp++).getImm(),
+ X86II::getSizeOfImm(Desc->TSFlags));
+ break;
+ }
+
+ case X86II::MRM0r: case X86II::MRM1r:
+ case X86II::MRM2r: case X86II::MRM3r:
+ case X86II::MRM4r: case X86II::MRM5r:
+ case X86II::MRM6r: case X86II::MRM7r: {
+ MCE.emitByte(BaseOpcode);
+ emitRegModRMByte(MI.getOperand(CurOp++).getReg(),
+ (Desc->TSFlags & X86II::FormMask)-X86II::MRM0r);
+
+ if (CurOp == NumOps)
+ break;
+
+ const MachineOperand &MO1 = MI.getOperand(CurOp++);
+ unsigned Size = X86II::getSizeOfImm(Desc->TSFlags);
+ if (MO1.isImm()) {
+ emitConstant(MO1.getImm(), Size);
+ break;
+ }
+
+ unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
+ : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
+ if (Opcode == X86::MOV64ri32)
+ rt = X86::reloc_absolute_word_sext; // FIXME: add X86II flag?
+ if (MO1.isGlobal()) {
+ bool Indirect = gvNeedsNonLazyPtr(MO1, TM);
+ emitGlobalAddress(MO1.getGlobal(), rt, MO1.getOffset(), 0,
+ Indirect);
+ } else if (MO1.isSymbol())
+ emitExternalSymbolAddress(MO1.getSymbolName(), rt);
+ else if (MO1.isCPI())
+ emitConstPoolAddress(MO1.getIndex(), rt);
+ else if (MO1.isJTI())
+ emitJumpTableAddress(MO1.getIndex(), rt);
+ break;
+ }
+
+ case X86II::MRM0m: case X86II::MRM1m:
+ case X86II::MRM2m: case X86II::MRM3m:
+ case X86II::MRM4m: case X86II::MRM5m:
+ case X86II::MRM6m: case X86II::MRM7m: {
+ intptr_t PCAdj = (CurOp + X86AddrNumOperands != NumOps) ?
+ (MI.getOperand(CurOp+X86AddrNumOperands).isImm() ?
+ X86II::getSizeOfImm(Desc->TSFlags) : 4) : 0;
+
+ MCE.emitByte(BaseOpcode);
+ emitMemModRMByte(MI, CurOp, (Desc->TSFlags & X86II::FormMask)-X86II::MRM0m,
+ PCAdj);
+ CurOp += X86AddrNumOperands;
+
+ if (CurOp == NumOps)
+ break;
+
+ const MachineOperand &MO = MI.getOperand(CurOp++);
+ unsigned Size = X86II::getSizeOfImm(Desc->TSFlags);
+ if (MO.isImm()) {
+ emitConstant(MO.getImm(), Size);
+ break;
+ }
+
+ unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
+ : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
+ if (Opcode == X86::MOV64mi32)
+ rt = X86::reloc_absolute_word_sext; // FIXME: add X86II flag?
+ if (MO.isGlobal()) {
+ bool Indirect = gvNeedsNonLazyPtr(MO, TM);
+ emitGlobalAddress(MO.getGlobal(), rt, MO.getOffset(), 0,
+ Indirect);
+ } else if (MO.isSymbol())
+ emitExternalSymbolAddress(MO.getSymbolName(), rt);
+ else if (MO.isCPI())
+ emitConstPoolAddress(MO.getIndex(), rt);
+ else if (MO.isJTI())
+ emitJumpTableAddress(MO.getIndex(), rt);
+ break;
+ }
+
+ case X86II::MRMInitReg:
+ MCE.emitByte(BaseOpcode);
+ // Duplicate register, used by things like MOV8r0 (aka xor reg,reg).
+ emitRegModRMByte(MI.getOperand(CurOp).getReg(),
+ getX86RegNum(MI.getOperand(CurOp).getReg()));
+ ++CurOp;
+ break;
+
+ case X86II::MRM_C1:
+ MCE.emitByte(BaseOpcode);
+ MCE.emitByte(0xC1);
+ break;
+ case X86II::MRM_C8:
+ MCE.emitByte(BaseOpcode);
+ MCE.emitByte(0xC8);
+ break;
+ case X86II::MRM_C9:
+ MCE.emitByte(BaseOpcode);
+ MCE.emitByte(0xC9);
+ break;
+ case X86II::MRM_E8:
+ MCE.emitByte(BaseOpcode);
+ MCE.emitByte(0xE8);
+ break;
+ case X86II::MRM_F0:
+ MCE.emitByte(BaseOpcode);
+ MCE.emitByte(0xF0);
+ break;
+ }
+
+ if (!Desc->isVariadic() && CurOp != NumOps) {
+#ifndef NDEBUG
+ dbgs() << "Cannot encode all operands of: " << MI << "\n";
+#endif
+ llvm_unreachable(0);
+ }
+
+ MCE.processDebugLoc(MI.getDebugLoc(), false);
+}
diff --git a/contrib/llvm/lib/Target/X86/X86CompilationCallback_Win64.asm b/contrib/llvm/lib/Target/X86/X86CompilationCallback_Win64.asm
new file mode 100644
index 0000000..f321778
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86CompilationCallback_Win64.asm
@@ -0,0 +1,68 @@
+;;===-- X86CompilationCallback_Win64.asm - Implement Win64 JIT callback ---===
+;;
+;; The LLVM Compiler Infrastructure
+;;
+;; This file is distributed under the University of Illinois Open Source
+;; License. See LICENSE.TXT for details.
+;;
+;;===----------------------------------------------------------------------===
+;;
+;; This file implements the JIT interfaces for the X86 target.
+;;
+;;===----------------------------------------------------------------------===
+
+extrn X86CompilationCallback2: PROC
+
+.code
+X86CompilationCallback proc
+ push rbp
+
+ ; Save RSP.
+ mov rbp, rsp
+
+ ; Save all int arg registers
+    ; WARNING: We cannot use the register spill area - we're generating stubs by hand!
+ push rcx
+ push rdx
+ push r8
+ push r9
+
+ ; Align stack on 16-byte boundary.
+ and rsp, -16
+
+ ; Save all XMM arg registers. Also allocate reg spill area.
+ sub rsp, 96
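+    ; 96 = 32 bytes of home (spill) area for the callee plus 4 x 16 bytes
+    ; for xmm0-xmm3.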
+ movaps [rsp +32], xmm0
+ movaps [rsp+16+32], xmm1
+ movaps [rsp+32+32], xmm2
+ movaps [rsp+48+32], xmm3
+
+ ; JIT callee
+
+ ; Pass prev frame and return address.
+ mov rcx, rbp
+ mov rdx, qword ptr [rbp+8]
+ call X86CompilationCallback2
+
+ ; Restore all XMM arg registers.
+ movaps xmm3, [rsp+48+32]
+ movaps xmm2, [rsp+32+32]
+ movaps xmm1, [rsp+16+32]
+ movaps xmm0, [rsp +32]
+
+ ; Restore RSP.
+ mov rsp, rbp
+
+ ; Restore all int arg registers
+ sub rsp, 32
+ pop r9
+ pop r8
+ pop rdx
+ pop rcx
+
+ ; Restore RBP.
+ pop rbp
+ ret
+X86CompilationCallback endp
+
+End
diff --git a/contrib/llvm/lib/Target/X86/X86ELFWriterInfo.cpp b/contrib/llvm/lib/Target/X86/X86ELFWriterInfo.cpp
new file mode 100644
index 0000000..f84995d
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86ELFWriterInfo.cpp
@@ -0,0 +1,152 @@
+//===-- X86ELFWriterInfo.cpp - ELF Writer Info for the X86 backend --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements ELF writer information for the X86 backend.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86ELFWriterInfo.h"
+#include "X86Relocations.h"
+#include "llvm/Function.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Implementation of the X86ELFWriterInfo class
+//===----------------------------------------------------------------------===//
+
+X86ELFWriterInfo::X86ELFWriterInfo(TargetMachine &TM)
+ : TargetELFWriterInfo(TM) {
+ EMachine = is64Bit ? EM_X86_64 : EM_386;
+ }
+
+X86ELFWriterInfo::~X86ELFWriterInfo() {}
+
+unsigned X86ELFWriterInfo::getRelocationType(unsigned MachineRelTy) const {
+ if (is64Bit) {
+ switch(MachineRelTy) {
+ case X86::reloc_pcrel_word:
+ return R_X86_64_PC32;
+ case X86::reloc_absolute_word:
+ return R_X86_64_32;
+ case X86::reloc_absolute_word_sext:
+ return R_X86_64_32S;
+ case X86::reloc_absolute_dword:
+ return R_X86_64_64;
+ case X86::reloc_picrel_word:
+ default:
+ llvm_unreachable("unknown x86_64 machine relocation type");
+ }
+ } else {
+ switch(MachineRelTy) {
+ case X86::reloc_pcrel_word:
+ return R_386_PC32;
+ case X86::reloc_absolute_word:
+ return R_386_32;
+ case X86::reloc_absolute_word_sext:
+ case X86::reloc_absolute_dword:
+ case X86::reloc_picrel_word:
+ default:
+ llvm_unreachable("unknown x86 machine relocation type");
+ }
+ }
+ return 0;
+}
+
+long int X86ELFWriterInfo::getDefaultAddendForRelTy(unsigned RelTy,
+ long int Modifier) const {
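+  // PC-relative relocations measure from the end of the 4-byte relocated
+  // field, so the default addend subtracts the field width from the modifier.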
+ if (is64Bit) {
+ switch(RelTy) {
+ case R_X86_64_PC32: return Modifier - 4;
+ case R_X86_64_32:
+ case R_X86_64_32S:
+ case R_X86_64_64:
+ return Modifier;
+ default:
+ llvm_unreachable("unknown x86_64 relocation type");
+ }
+ } else {
+ switch(RelTy) {
+ case R_386_PC32: return Modifier - 4;
+ case R_386_32: return Modifier;
+ default:
+ llvm_unreachable("unknown x86 relocation type");
+ }
+ }
+ return 0;
+}
+
+unsigned X86ELFWriterInfo::getRelocationTySize(unsigned RelTy) const {
+ if (is64Bit) {
+ switch(RelTy) {
+ case R_X86_64_PC32:
+ case R_X86_64_32:
+ case R_X86_64_32S:
+ return 32;
+ case R_X86_64_64:
+ return 64;
+ default:
+ llvm_unreachable("unknown x86_64 relocation type");
+ }
+ } else {
+ switch(RelTy) {
+ case R_386_PC32:
+ case R_386_32:
+ return 32;
+ default:
+ llvm_unreachable("unknown x86 relocation type");
+ }
+ }
+ return 0;
+}
+
+bool X86ELFWriterInfo::isPCRelativeRel(unsigned RelTy) const {
+ if (is64Bit) {
+ switch(RelTy) {
+ case R_X86_64_PC32:
+ return true;
+ case R_X86_64_32:
+ case R_X86_64_32S:
+ case R_X86_64_64:
+ return false;
+ default:
+ llvm_unreachable("unknown x86_64 relocation type");
+ }
+ } else {
+ switch(RelTy) {
+ case R_386_PC32:
+ return true;
+ case R_386_32:
+ return false;
+ default:
+ llvm_unreachable("unknown x86 relocation type");
+ }
+ }
+ return 0;
+}
+
+unsigned X86ELFWriterInfo::getAbsoluteLabelMachineRelTy() const {
+ return is64Bit ?
+ X86::reloc_absolute_dword : X86::reloc_absolute_word;
+}
+
+long int X86ELFWriterInfo::computeRelocation(unsigned SymOffset,
+ unsigned RelOffset,
+ unsigned RelTy) const {
+
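+  // For 32-bit PC-relative relocations the stored value is the distance from
+  // the end of the 4-byte field to the symbol.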
+ if (RelTy == R_X86_64_PC32 || RelTy == R_386_PC32)
+ return SymOffset - (RelOffset + 4);
+ else
+    assert(0 && "computeRelocation unknown for this relocation type");
+
+ return 0;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86ELFWriterInfo.h b/contrib/llvm/lib/Target/X86/X86ELFWriterInfo.h
new file mode 100644
index 0000000..342e6e6
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86ELFWriterInfo.h
@@ -0,0 +1,76 @@
+//===-- X86ELFWriterInfo.h - ELF Writer Info for X86 ------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements ELF writer information for the X86 backend.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86_ELF_WRITER_INFO_H
+#define X86_ELF_WRITER_INFO_H
+
+#include "llvm/Target/TargetELFWriterInfo.h"
+
+namespace llvm {
+
+ class X86ELFWriterInfo : public TargetELFWriterInfo {
+
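+    // The numeric values below are the relocation numbers defined by the
+    // i386 and x86-64 ELF psABIs.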
+ // ELF Relocation types for X86
+ enum X86RelocationType {
+ R_386_NONE = 0,
+ R_386_32 = 1,
+ R_386_PC32 = 2
+ };
+
+ // ELF Relocation types for X86_64
+ enum X86_64RelocationType {
+ R_X86_64_NONE = 0,
+ R_X86_64_64 = 1,
+ R_X86_64_PC32 = 2,
+ R_X86_64_32 = 10,
+ R_X86_64_32S = 11,
+ R_X86_64_PC64 = 24
+ };
+
+ public:
+ X86ELFWriterInfo(TargetMachine &TM);
+ virtual ~X86ELFWriterInfo();
+
+ /// getRelocationType - Returns the target specific ELF Relocation type.
+ /// 'MachineRelTy' contains the object code independent relocation type
+ virtual unsigned getRelocationType(unsigned MachineRelTy) const;
+
+ /// hasRelocationAddend - True if the target uses an addend in the
+ /// ELF relocation entry.
+    virtual bool hasRelocationAddend() const { return is64Bit; }
+
+ /// getDefaultAddendForRelTy - Gets the default addend value for a
+ /// relocation entry based on the target ELF relocation type.
+ virtual long int getDefaultAddendForRelTy(unsigned RelTy,
+ long int Modifier = 0) const;
+
+    /// getRelocationTySize - Returns the size of the relocatable field in bits
+ virtual unsigned getRelocationTySize(unsigned RelTy) const;
+
+ /// isPCRelativeRel - True if the relocation type is pc relative
+ virtual bool isPCRelativeRel(unsigned RelTy) const;
+
+    /// getAbsoluteLabelMachineRelTy - Returns the machine relocation type used
+    /// to reference an absolute label, e.g. a jump table.
+ virtual unsigned getAbsoluteLabelMachineRelTy() const;
+
+    /// computeRelocation - Some relocatable fields can be relocated directly,
+    /// avoiding relocation symbol emission; compute the final relocation value
+    /// for such a field.
+ virtual long int computeRelocation(unsigned SymOffset, unsigned RelOffset,
+ unsigned RelTy) const;
+ };
+
+} // end llvm namespace
+
+#endif // X86_ELF_WRITER_INFO_H
diff --git a/contrib/llvm/lib/Target/X86/X86FastISel.cpp b/contrib/llvm/lib/Target/X86/X86FastISel.cpp
new file mode 100644
index 0000000..1bc5eb7
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86FastISel.cpp
@@ -0,0 +1,1775 @@
+//===-- X86FastISel.cpp - X86 FastISel implementation ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the X86-specific support for the FastISel class. Much
+// of the target-specific code is generated by tablegen in the file
+// X86GenFastISel.inc, which is #included here.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/CallingConv.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+namespace {
+
+class X86FastISel : public FastISel {
+ /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
+ /// make the right decision when generating code for different targets.
+ const X86Subtarget *Subtarget;
+
+ /// StackPtr - Register used as the stack pointer.
+ ///
+ unsigned StackPtr;
+
+ /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87
+ /// floating point ops.
+ /// When SSE is available, use it for f32 operations.
+ /// When SSE2 is available, use it for f64 operations.
+ bool X86ScalarSSEf64;
+ bool X86ScalarSSEf32;
+
+public:
+ explicit X86FastISel(MachineFunction &mf,
+ DenseMap<const Value *, unsigned> &vm,
+ DenseMap<const BasicBlock *, MachineBasicBlock *> &bm,
+ DenseMap<const AllocaInst *, int> &am,
+ std::vector<std::pair<MachineInstr*, unsigned> > &pn
+#ifndef NDEBUG
+ , SmallSet<const Instruction *, 8> &cil
+#endif
+ )
+ : FastISel(mf, vm, bm, am, pn
+#ifndef NDEBUG
+ , cil
+#endif
+ ) {
+ Subtarget = &TM.getSubtarget<X86Subtarget>();
+ StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;
+ X86ScalarSSEf64 = Subtarget->hasSSE2();
+ X86ScalarSSEf32 = Subtarget->hasSSE1();
+ }
+
+ virtual bool TargetSelectInstruction(const Instruction *I);
+
+#include "X86GenFastISel.inc"
+
+private:
+ bool X86FastEmitCompare(const Value *LHS, const Value *RHS, EVT VT);
+
+ bool X86FastEmitLoad(EVT VT, const X86AddressMode &AM, unsigned &RR);
+
+ bool X86FastEmitStore(EVT VT, const Value *Val,
+ const X86AddressMode &AM);
+ bool X86FastEmitStore(EVT VT, unsigned Val,
+ const X86AddressMode &AM);
+
+ bool X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT,
+ unsigned &ResultReg);
+
+ bool X86SelectAddress(const Value *V, X86AddressMode &AM);
+ bool X86SelectCallAddress(const Value *V, X86AddressMode &AM);
+
+ bool X86SelectLoad(const Instruction *I);
+
+ bool X86SelectStore(const Instruction *I);
+
+ bool X86SelectCmp(const Instruction *I);
+
+ bool X86SelectZExt(const Instruction *I);
+
+ bool X86SelectBranch(const Instruction *I);
+
+ bool X86SelectShift(const Instruction *I);
+
+ bool X86SelectSelect(const Instruction *I);
+
+ bool X86SelectTrunc(const Instruction *I);
+
+ bool X86SelectFPExt(const Instruction *I);
+ bool X86SelectFPTrunc(const Instruction *I);
+
+ bool X86SelectExtractValue(const Instruction *I);
+
+ bool X86VisitIntrinsicCall(const IntrinsicInst &I);
+ bool X86SelectCall(const Instruction *I);
+
+ CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool isTailCall = false);
+
+ const X86InstrInfo *getInstrInfo() const {
+ return getTargetMachine()->getInstrInfo();
+ }
+ const X86TargetMachine *getTargetMachine() const {
+ return static_cast<const X86TargetMachine *>(&TM);
+ }
+
+ unsigned TargetMaterializeConstant(const Constant *C);
+
+ unsigned TargetMaterializeAlloca(const AllocaInst *C);
+
+ /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is
+ /// computed in an SSE register, not on the X87 floating point stack.
+ bool isScalarFPTypeInSSEReg(EVT VT) const {
+ return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2
+ (VT == MVT::f32 && X86ScalarSSEf32); // f32 is when SSE1
+ }
+
+ bool isTypeLegal(const Type *Ty, EVT &VT, bool AllowI1 = false);
+};
+
+} // end anonymous namespace.
+
+bool X86FastISel::isTypeLegal(const Type *Ty, EVT &VT, bool AllowI1) {
+ VT = TLI.getValueType(Ty, /*HandleUnknown=*/true);
+ if (VT == MVT::Other || !VT.isSimple())
+ // Unhandled type. Halt "fast" selection and bail.
+ return false;
+
+ // For now, require SSE/SSE2 for performing floating-point operations,
+ // since x87 requires additional work.
+ if (VT == MVT::f64 && !X86ScalarSSEf64)
+ return false;
+ if (VT == MVT::f32 && !X86ScalarSSEf32)
+ return false;
+ // Similarly, no f80 support yet.
+ if (VT == MVT::f80)
+ return false;
+ // We only handle legal types. For example, on x86-32 the instruction
+ // selector contains all of the 64-bit instructions from x86-64,
+ // under the assumption that i64 won't be used if the target doesn't
+ // support it.
+ return (AllowI1 && VT == MVT::i1) || TLI.isTypeLegal(VT);
+}
+
+#include "X86GenCallingConv.inc"
+
+/// CCAssignFnForCall - Selects the correct CCAssignFn for a given calling
+/// convention.
+CCAssignFn *X86FastISel::CCAssignFnForCall(CallingConv::ID CC,
+                                           bool isTailCall) {
+ if (Subtarget->is64Bit()) {
+ if (CC == CallingConv::GHC)
+ return CC_X86_64_GHC;
+ else if (Subtarget->isTargetWin64())
+ return CC_X86_Win64_C;
+ else
+ return CC_X86_64_C;
+ }
+
+ if (CC == CallingConv::X86_FastCall)
+ return CC_X86_32_FastCall;
+ else if (CC == CallingConv::X86_ThisCall)
+ return CC_X86_32_ThisCall;
+ else if (CC == CallingConv::Fast)
+ return CC_X86_32_FastCC;
+ else if (CC == CallingConv::GHC)
+ return CC_X86_32_GHC;
+ else
+ return CC_X86_32_C;
+}
+
+/// X86FastEmitLoad - Emit a machine instruction to load a value of type VT.
+/// The address is either pre-computed, i.e. Ptr, or a GlobalAddress, i.e. GV.
+/// Return true and the result register by reference if it is possible.
+bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM,
+ unsigned &ResultReg) {
+ // Get opcode and regclass of the output for the given load instruction.
+ unsigned Opc = 0;
+ const TargetRegisterClass *RC = NULL;
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: return false;
+ case MVT::i1:
+ case MVT::i8:
+ Opc = X86::MOV8rm;
+ RC = X86::GR8RegisterClass;
+ break;
+ case MVT::i16:
+ Opc = X86::MOV16rm;
+ RC = X86::GR16RegisterClass;
+ break;
+ case MVT::i32:
+ Opc = X86::MOV32rm;
+ RC = X86::GR32RegisterClass;
+ break;
+ case MVT::i64:
+ // Must be in x86-64 mode.
+ Opc = X86::MOV64rm;
+ RC = X86::GR64RegisterClass;
+ break;
+ case MVT::f32:
+ if (Subtarget->hasSSE1()) {
+ Opc = X86::MOVSSrm;
+ RC = X86::FR32RegisterClass;
+ } else {
+ Opc = X86::LD_Fp32m;
+ RC = X86::RFP32RegisterClass;
+ }
+ break;
+ case MVT::f64:
+ if (Subtarget->hasSSE2()) {
+ Opc = X86::MOVSDrm;
+ RC = X86::FR64RegisterClass;
+ } else {
+ Opc = X86::LD_Fp64m;
+ RC = X86::RFP64RegisterClass;
+ }
+ break;
+ case MVT::f80:
+ // No f80 support yet.
+ return false;
+ }
+
+ ResultReg = createResultReg(RC);
+ addFullAddress(BuildMI(MBB, DL, TII.get(Opc), ResultReg), AM);
+ return true;
+}
+
+/// X86FastEmitStore - Emit a machine instruction to store a value Val of
+/// type VT. The address is either pre-computed, consisting of a base ptr, Ptr,
+/// and a displacement offset, or a GlobalAddress,
+/// i.e. V. Return true if it is possible.
+bool
+X86FastISel::X86FastEmitStore(EVT VT, unsigned Val,
+ const X86AddressMode &AM) {
+ // Get opcode and regclass of the output for the given store instruction.
+ unsigned Opc = 0;
+ switch (VT.getSimpleVT().SimpleTy) {
+ case MVT::f80: // No f80 support yet.
+ default: return false;
+ case MVT::i1: {
+ // Mask out all but lowest bit.
+ unsigned AndResult = createResultReg(X86::GR8RegisterClass);
+ BuildMI(MBB, DL,
+ TII.get(X86::AND8ri), AndResult).addReg(Val).addImm(1);
+ Val = AndResult;
+ }
+ // FALLTHROUGH, handling i1 as i8.
+ case MVT::i8: Opc = X86::MOV8mr; break;
+ case MVT::i16: Opc = X86::MOV16mr; break;
+ case MVT::i32: Opc = X86::MOV32mr; break;
+ case MVT::i64: Opc = X86::MOV64mr; break; // Must be in x86-64 mode.
+ case MVT::f32:
+ Opc = Subtarget->hasSSE1() ? X86::MOVSSmr : X86::ST_Fp32m;
+ break;
+ case MVT::f64:
+ Opc = Subtarget->hasSSE2() ? X86::MOVSDmr : X86::ST_Fp64m;
+ break;
+ }
+
+ addFullAddress(BuildMI(MBB, DL, TII.get(Opc)), AM).addReg(Val);
+ return true;
+}
+
+bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val,
+ const X86AddressMode &AM) {
+ // Handle 'null' like i32/i64 0.
+ if (isa<ConstantPointerNull>(Val))
+ Val = Constant::getNullValue(TD.getIntPtrType(Val->getContext()));
+
+ // If this is a store of a simple constant, fold the constant into the store.
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(Val)) {
+ unsigned Opc = 0;
+ bool Signed = true;
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: break;
+ case MVT::i1: Signed = false; // FALLTHROUGH to handle as i8.
+ case MVT::i8: Opc = X86::MOV8mi; break;
+ case MVT::i16: Opc = X86::MOV16mi; break;
+ case MVT::i32: Opc = X86::MOV32mi; break;
+ case MVT::i64:
+ // Must be a 32-bit sign extended value.
+ if ((int)CI->getSExtValue() == CI->getSExtValue())
+ Opc = X86::MOV64mi32;
+ break;
+ }
+
+ if (Opc) {
+ addFullAddress(BuildMI(MBB, DL, TII.get(Opc)), AM)
+ .addImm(Signed ? (uint64_t) CI->getSExtValue() :
+ CI->getZExtValue());
+ return true;
+ }
+ }
+
+ unsigned ValReg = getRegForValue(Val);
+ if (ValReg == 0)
+ return false;
+
+ return X86FastEmitStore(VT, ValReg, AM);
+}
+
+/// X86FastEmitExtend - Emit a machine instruction to extend a value Src of
+/// type SrcVT to type DstVT using the specified extension opcode Opc (e.g.
+/// ISD::SIGN_EXTEND).
+bool X86FastISel::X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT,
+ unsigned Src, EVT SrcVT,
+ unsigned &ResultReg) {
+ unsigned RR = FastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Opc,
+ Src, /*TODO: Kill=*/false);
+
+ if (RR != 0) {
+ ResultReg = RR;
+ return true;
+ } else
+ return false;
+}
+
+/// X86SelectAddress - Attempt to fill in an address from the given value.
+///
+bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) {
+ const User *U = NULL;
+ unsigned Opcode = Instruction::UserOp1;
+ if (const Instruction *I = dyn_cast<Instruction>(V)) {
+ Opcode = I->getOpcode();
+ U = I;
+ } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) {
+ Opcode = C->getOpcode();
+ U = C;
+ }
+
+ switch (Opcode) {
+ default: break;
+ case Instruction::BitCast:
+ // Look past bitcasts.
+ return X86SelectAddress(U->getOperand(0), AM);
+
+ case Instruction::IntToPtr:
+ // Look past no-op inttoptrs.
+ if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
+ return X86SelectAddress(U->getOperand(0), AM);
+ break;
+
+ case Instruction::PtrToInt:
+ // Look past no-op ptrtoints.
+ if (TLI.getValueType(U->getType()) == TLI.getPointerTy())
+ return X86SelectAddress(U->getOperand(0), AM);
+ break;
+
+ case Instruction::Alloca: {
+ // Do static allocas.
+ const AllocaInst *A = cast<AllocaInst>(V);
+ DenseMap<const AllocaInst*, int>::iterator SI = StaticAllocaMap.find(A);
+ if (SI != StaticAllocaMap.end()) {
+ AM.BaseType = X86AddressMode::FrameIndexBase;
+ AM.Base.FrameIndex = SI->second;
+ return true;
+ }
+ break;
+ }
+
+ case Instruction::Add: {
+ // Adds of constants are common and easy enough.
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(U->getOperand(1))) {
+ uint64_t Disp = (int32_t)AM.Disp + (uint64_t)CI->getSExtValue();
+ // They have to fit in the 32-bit signed displacement field though.
+ if (isInt<32>(Disp)) {
+ AM.Disp = (uint32_t)Disp;
+ return X86SelectAddress(U->getOperand(0), AM);
+ }
+ }
+ break;
+ }
+
+ case Instruction::GetElementPtr: {
+ X86AddressMode SavedAM = AM;
+
+ // Pattern-match simple GEPs.
+ uint64_t Disp = (int32_t)AM.Disp;
+ unsigned IndexReg = AM.IndexReg;
+ unsigned Scale = AM.Scale;
+ gep_type_iterator GTI = gep_type_begin(U);
+ // Iterate through the indices, folding what we can. Constants can be
+ // folded, and one dynamic index can be handled, if the scale is supported.
+ for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end();
+ i != e; ++i, ++GTI) {
+ const Value *Op = *i;
+ if (const StructType *STy = dyn_cast<StructType>(*GTI)) {
+ const StructLayout *SL = TD.getStructLayout(STy);
+ unsigned Idx = cast<ConstantInt>(Op)->getZExtValue();
+ Disp += SL->getElementOffset(Idx);
+ } else {
+ uint64_t S = TD.getTypeAllocSize(GTI.getIndexedType());
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+ // Constant-offset addressing.
+ Disp += CI->getSExtValue() * S;
+ } else if (IndexReg == 0 &&
+ (!AM.GV || !Subtarget->isPICStyleRIPRel()) &&
+ (S == 1 || S == 2 || S == 4 || S == 8)) {
+ // Scaled-index addressing.
+ Scale = S;
+ IndexReg = getRegForGEPIndex(Op).first;
+ if (IndexReg == 0)
+ return false;
+ } else
+ // Unsupported.
+ goto unsupported_gep;
+ }
+ }
+ // Check for displacement overflow.
+ if (!isInt<32>(Disp))
+ break;
+ // Ok, the GEP indices were covered by constant-offset and scaled-index
+ // addressing. Update the address state and move on to examining the base.
+ AM.IndexReg = IndexReg;
+ AM.Scale = Scale;
+ AM.Disp = (uint32_t)Disp;
+ if (X86SelectAddress(U->getOperand(0), AM))
+ return true;
+
+ // If we couldn't merge the sub value into this addr mode, revert back to
+ // our address and just match the value instead of completely failing.
+ AM = SavedAM;
+ break;
+ unsupported_gep:
+ // Ok, the GEP indices weren't all covered.
+ break;
+ }
+ }
+
+ // Handle constant address.
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+ // Can't handle alternate code models yet.
+ if (TM.getCodeModel() != CodeModel::Small)
+ return false;
+
+ // RIP-relative addresses can't have additional register operands.
+ if (Subtarget->isPICStyleRIPRel() &&
+ (AM.Base.Reg != 0 || AM.IndexReg != 0))
+ return false;
+
+ // Can't handle TLS yet.
+ if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
+ if (GVar->isThreadLocal())
+ return false;
+
+ // Okay, we've committed to selecting this global. Set up the basic address.
+ AM.GV = GV;
+
+ // Allow the subtarget to classify the global.
+ unsigned char GVFlags = Subtarget->ClassifyGlobalReference(GV, TM);
+
+ // If this reference is relative to the pic base, set it now.
+ if (isGlobalRelativeToPICBase(GVFlags)) {
+ // FIXME: How do we know Base.Reg is free??
+ AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(&MF);
+ }
+
+ // Unless the ABI requires an extra load, return a direct reference to
+ // the global.
+ if (!isGlobalStubReference(GVFlags)) {
+ if (Subtarget->isPICStyleRIPRel()) {
+ // Use rip-relative addressing if we can. Above we verified that the
+ // base and index registers are unused.
+ assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
+ AM.Base.Reg = X86::RIP;
+ }
+ AM.GVOpFlags = GVFlags;
+ return true;
+ }
+
+ // Ok, we need to do a load from a stub. If we've already loaded from this
+ // stub, reuse the loaded pointer, otherwise emit the load now.
+ DenseMap<const Value*, unsigned>::iterator I = LocalValueMap.find(V);
+ unsigned LoadReg;
+ if (I != LocalValueMap.end() && I->second != 0) {
+ LoadReg = I->second;
+ } else {
+ // Issue load from stub.
+ unsigned Opc = 0;
+ const TargetRegisterClass *RC = NULL;
+ X86AddressMode StubAM;
+ StubAM.Base.Reg = AM.Base.Reg;
+ StubAM.GV = GV;
+ StubAM.GVOpFlags = GVFlags;
+
+ if (TLI.getPointerTy() == MVT::i64) {
+ Opc = X86::MOV64rm;
+ RC = X86::GR64RegisterClass;
+
+ if (Subtarget->isPICStyleRIPRel())
+ StubAM.Base.Reg = X86::RIP;
+ } else {
+ Opc = X86::MOV32rm;
+ RC = X86::GR32RegisterClass;
+ }
+
+ LoadReg = createResultReg(RC);
+ addFullAddress(BuildMI(MBB, DL, TII.get(Opc), LoadReg), StubAM);
+
+ // Prevent loading GV stub multiple times in same MBB.
+ LocalValueMap[V] = LoadReg;
+ }
+
+ // Now construct the final address. Note that the Disp, Scale,
+ // and Index values may already be set here.
+ AM.Base.Reg = LoadReg;
+ AM.GV = 0;
+ return true;
+ }
+
+ // If all else fails, try to materialize the value in a register.
+ if (!AM.GV || !Subtarget->isPICStyleRIPRel()) {
+ if (AM.Base.Reg == 0) {
+ AM.Base.Reg = getRegForValue(V);
+ return AM.Base.Reg != 0;
+ }
+ if (AM.IndexReg == 0) {
+ assert(AM.Scale == 1 && "Scale with no index!");
+ AM.IndexReg = getRegForValue(V);
+ return AM.IndexReg != 0;
+ }
+ }
+
+ return false;
+}
+
+/// X86SelectCallAddress - Attempt to fill in an address from the given value.
+///
+bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) {
+ const User *U = NULL;
+ unsigned Opcode = Instruction::UserOp1;
+ if (const Instruction *I = dyn_cast<Instruction>(V)) {
+ Opcode = I->getOpcode();
+ U = I;
+ } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) {
+ Opcode = C->getOpcode();
+ U = C;
+ }
+
+ switch (Opcode) {
+ default: break;
+ case Instruction::BitCast:
+ // Look past bitcasts.
+ return X86SelectCallAddress(U->getOperand(0), AM);
+
+ case Instruction::IntToPtr:
+ // Look past no-op inttoptrs.
+ if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
+ return X86SelectCallAddress(U->getOperand(0), AM);
+ break;
+
+ case Instruction::PtrToInt:
+ // Look past no-op ptrtoints.
+ if (TLI.getValueType(U->getType()) == TLI.getPointerTy())
+ return X86SelectCallAddress(U->getOperand(0), AM);
+ break;
+ }
+
+ // Handle constant address.
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+ // Can't handle alternate code models yet.
+ if (TM.getCodeModel() != CodeModel::Small)
+ return false;
+
+ // RIP-relative addresses can't have additional register operands.
+ if (Subtarget->isPICStyleRIPRel() &&
+ (AM.Base.Reg != 0 || AM.IndexReg != 0))
+ return false;
+
+ // Can't handle TLS or DLLImport.
+ if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
+ if (GVar->isThreadLocal() || GVar->hasDLLImportLinkage())
+ return false;
+
+ // Okay, we've committed to selecting this global. Set up the basic address.
+ AM.GV = GV;
+
+ // No ABI requires an extra load for anything other than DLLImport, which
+ // we rejected above. Return a direct reference to the global.
+ if (Subtarget->isPICStyleRIPRel()) {
+ // Use rip-relative addressing if we can. Above we verified that the
+ // base and index registers are unused.
+ assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
+ AM.Base.Reg = X86::RIP;
+ } else if (Subtarget->isPICStyleStubPIC()) {
+ AM.GVOpFlags = X86II::MO_PIC_BASE_OFFSET;
+ } else if (Subtarget->isPICStyleGOT()) {
+ AM.GVOpFlags = X86II::MO_GOTOFF;
+ }
+
+ return true;
+ }
+
+ // If all else fails, try to materialize the value in a register.
+ if (!AM.GV || !Subtarget->isPICStyleRIPRel()) {
+ if (AM.Base.Reg == 0) {
+ AM.Base.Reg = getRegForValue(V);
+ return AM.Base.Reg != 0;
+ }
+ if (AM.IndexReg == 0) {
+ assert(AM.Scale == 1 && "Scale with no index!");
+ AM.IndexReg = getRegForValue(V);
+ return AM.IndexReg != 0;
+ }
+ }
+
+ return false;
+}
+
+
+/// X86SelectStore - Select and emit code to implement store instructions.
+bool X86FastISel::X86SelectStore(const Instruction *I) {
+ EVT VT;
+ if (!isTypeLegal(I->getOperand(0)->getType(), VT, /*AllowI1=*/true))
+ return false;
+
+ X86AddressMode AM;
+ if (!X86SelectAddress(I->getOperand(1), AM))
+ return false;
+
+ return X86FastEmitStore(VT, I->getOperand(0), AM);
+}
+
+/// X86SelectLoad - Select and emit code to implement load instructions.
+///
+bool X86FastISel::X86SelectLoad(const Instruction *I) {
+ EVT VT;
+ if (!isTypeLegal(I->getType(), VT, /*AllowI1=*/true))
+ return false;
+
+ X86AddressMode AM;
+ if (!X86SelectAddress(I->getOperand(0), AM))
+ return false;
+
+ unsigned ResultReg = 0;
+ if (X86FastEmitLoad(VT, AM, ResultReg)) {
+ UpdateValueMap(I, ResultReg);
+ return true;
+ }
+ return false;
+}
+
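+/// X86ChooseCmpOpcode - Return the register/register compare opcode to use
+/// for values of the given type (CMPrr for integers, UCOMIS for floats), or
+/// 0 if the type is not supported.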
+static unsigned X86ChooseCmpOpcode(EVT VT) {
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: return 0;
+ case MVT::i8: return X86::CMP8rr;
+ case MVT::i16: return X86::CMP16rr;
+ case MVT::i32: return X86::CMP32rr;
+ case MVT::i64: return X86::CMP64rr;
+ case MVT::f32: return X86::UCOMISSrr;
+ case MVT::f64: return X86::UCOMISDrr;
+ }
+}
+
+/// X86ChooseCmpImmediateOpcode - If a comparison uses the given constant as
+/// its RHS, return an opcode that can fold it as an immediate (e.g. CMP32ri);
+/// otherwise return 0.
+static unsigned X86ChooseCmpImmediateOpcode(EVT VT, const ConstantInt *RHSC) {
+ switch (VT.getSimpleVT().SimpleTy) {
+ // Otherwise, we can't fold the immediate into this comparison.
+ default: return 0;
+ case MVT::i8: return X86::CMP8ri;
+ case MVT::i16: return X86::CMP16ri;
+ case MVT::i32: return X86::CMP32ri;
+ case MVT::i64:
+ // 64-bit comparisons are only valid if the immediate fits in a 32-bit sext
+ // field.
+ if ((int)RHSC->getSExtValue() == RHSC->getSExtValue())
+ return X86::CMP64ri32;
+ return 0;
+ }
+}
+
+bool X86FastISel::X86FastEmitCompare(const Value *Op0, const Value *Op1,
+ EVT VT) {
+ unsigned Op0Reg = getRegForValue(Op0);
+ if (Op0Reg == 0) return false;
+
+ // Handle 'null' like i32/i64 0.
+ if (isa<ConstantPointerNull>(Op1))
+ Op1 = Constant::getNullValue(TD.getIntPtrType(Op0->getContext()));
+
+ // We have two options: compare with register or immediate. If the RHS of
+ // the compare is an immediate that we can fold into this compare, use
+ // CMPri, otherwise use CMPrr.
+ if (const ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
+ if (unsigned CompareImmOpc = X86ChooseCmpImmediateOpcode(VT, Op1C)) {
+ BuildMI(MBB, DL, TII.get(CompareImmOpc)).addReg(Op0Reg)
+ .addImm(Op1C->getSExtValue());
+ return true;
+ }
+ }
+
+ unsigned CompareOpc = X86ChooseCmpOpcode(VT);
+ if (CompareOpc == 0) return false;
+
+ unsigned Op1Reg = getRegForValue(Op1);
+ if (Op1Reg == 0) return false;
+ BuildMI(MBB, DL, TII.get(CompareOpc)).addReg(Op0Reg).addReg(Op1Reg);
+
+ return true;
+}
+
+bool X86FastISel::X86SelectCmp(const Instruction *I) {
+ const CmpInst *CI = cast<CmpInst>(I);
+
+ EVT VT;
+ if (!isTypeLegal(I->getOperand(0)->getType(), VT))
+ return false;
+
+ unsigned ResultReg = createResultReg(&X86::GR8RegClass);
+ unsigned SetCCOpc;
+ bool SwapArgs; // false -> compare Op0, Op1. true -> compare Op1, Op0.
+ switch (CI->getPredicate()) {
+ case CmpInst::FCMP_OEQ: {
+ if (!X86FastEmitCompare(CI->getOperand(0), CI->getOperand(1), VT))
+ return false;
+
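+    // After a UCOMIS compare, ordered-equal leaves ZF set and PF clear (an
+    // unordered result sets PF), so AND together SETE and SETNP.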
+ unsigned EReg = createResultReg(&X86::GR8RegClass);
+ unsigned NPReg = createResultReg(&X86::GR8RegClass);
+ BuildMI(MBB, DL, TII.get(X86::SETEr), EReg);
+ BuildMI(MBB, DL, TII.get(X86::SETNPr), NPReg);
+ BuildMI(MBB, DL,
+ TII.get(X86::AND8rr), ResultReg).addReg(NPReg).addReg(EReg);
+ UpdateValueMap(I, ResultReg);
+ return true;
+ }
+ case CmpInst::FCMP_UNE: {
+ if (!X86FastEmitCompare(CI->getOperand(0), CI->getOperand(1), VT))
+ return false;
+
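+    // Unordered-or-not-equal is true when ZF is clear or PF is set, so OR
+    // together SETNE and SETP.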
+ unsigned NEReg = createResultReg(&X86::GR8RegClass);
+ unsigned PReg = createResultReg(&X86::GR8RegClass);
+ BuildMI(MBB, DL, TII.get(X86::SETNEr), NEReg);
+ BuildMI(MBB, DL, TII.get(X86::SETPr), PReg);
+ BuildMI(MBB, DL, TII.get(X86::OR8rr), ResultReg).addReg(PReg).addReg(NEReg);
+ UpdateValueMap(I, ResultReg);
+ return true;
+ }
+ case CmpInst::FCMP_OGT: SwapArgs = false; SetCCOpc = X86::SETAr; break;
+ case CmpInst::FCMP_OGE: SwapArgs = false; SetCCOpc = X86::SETAEr; break;
+ case CmpInst::FCMP_OLT: SwapArgs = true; SetCCOpc = X86::SETAr; break;
+ case CmpInst::FCMP_OLE: SwapArgs = true; SetCCOpc = X86::SETAEr; break;
+ case CmpInst::FCMP_ONE: SwapArgs = false; SetCCOpc = X86::SETNEr; break;
+ case CmpInst::FCMP_ORD: SwapArgs = false; SetCCOpc = X86::SETNPr; break;
+ case CmpInst::FCMP_UNO: SwapArgs = false; SetCCOpc = X86::SETPr; break;
+ case CmpInst::FCMP_UEQ: SwapArgs = false; SetCCOpc = X86::SETEr; break;
+ case CmpInst::FCMP_UGT: SwapArgs = true; SetCCOpc = X86::SETBr; break;
+ case CmpInst::FCMP_UGE: SwapArgs = true; SetCCOpc = X86::SETBEr; break;
+ case CmpInst::FCMP_ULT: SwapArgs = false; SetCCOpc = X86::SETBr; break;
+ case CmpInst::FCMP_ULE: SwapArgs = false; SetCCOpc = X86::SETBEr; break;
+
+ case CmpInst::ICMP_EQ: SwapArgs = false; SetCCOpc = X86::SETEr; break;
+ case CmpInst::ICMP_NE: SwapArgs = false; SetCCOpc = X86::SETNEr; break;
+ case CmpInst::ICMP_UGT: SwapArgs = false; SetCCOpc = X86::SETAr; break;
+ case CmpInst::ICMP_UGE: SwapArgs = false; SetCCOpc = X86::SETAEr; break;
+ case CmpInst::ICMP_ULT: SwapArgs = false; SetCCOpc = X86::SETBr; break;
+ case CmpInst::ICMP_ULE: SwapArgs = false; SetCCOpc = X86::SETBEr; break;
+ case CmpInst::ICMP_SGT: SwapArgs = false; SetCCOpc = X86::SETGr; break;
+ case CmpInst::ICMP_SGE: SwapArgs = false; SetCCOpc = X86::SETGEr; break;
+ case CmpInst::ICMP_SLT: SwapArgs = false; SetCCOpc = X86::SETLr; break;
+ case CmpInst::ICMP_SLE: SwapArgs = false; SetCCOpc = X86::SETLEr; break;
+ default:
+ return false;
+ }
+
+ const Value *Op0 = CI->getOperand(0), *Op1 = CI->getOperand(1);
+ if (SwapArgs)
+ std::swap(Op0, Op1);
+
+ // Emit a compare of Op0/Op1.
+ if (!X86FastEmitCompare(Op0, Op1, VT))
+ return false;
+
+ BuildMI(MBB, DL, TII.get(SetCCOpc), ResultReg);
+ UpdateValueMap(I, ResultReg);
+ return true;
+}
+
+bool X86FastISel::X86SelectZExt(const Instruction *I) {
+ // Handle zero-extension from i1 to i8, which is common.
+ if (I->getType()->isIntegerTy(8) &&
+ I->getOperand(0)->getType()->isIntegerTy(1)) {
+ unsigned ResultReg = getRegForValue(I->getOperand(0));
+ if (ResultReg == 0) return false;
+ // Set the high bits to zero.
+ ResultReg = FastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false);
+ if (ResultReg == 0) return false;
+ UpdateValueMap(I, ResultReg);
+ return true;
+ }
+
+ return false;
+}
+
+
+bool X86FastISel::X86SelectBranch(const Instruction *I) {
+ // Unconditional branches are selected by tablegen-generated code.
+ // Handle a conditional branch.
+ const BranchInst *BI = cast<BranchInst>(I);
+ MachineBasicBlock *TrueMBB = MBBMap[BI->getSuccessor(0)];
+ MachineBasicBlock *FalseMBB = MBBMap[BI->getSuccessor(1)];
+
+ // Fold the common case of a conditional branch with a comparison.
+ if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
+ if (CI->hasOneUse()) {
+ EVT VT = TLI.getValueType(CI->getOperand(0)->getType());
+
+ // Try to take advantage of fallthrough opportunities.
+ CmpInst::Predicate Predicate = CI->getPredicate();
+ if (MBB->isLayoutSuccessor(TrueMBB)) {
+ std::swap(TrueMBB, FalseMBB);
+ Predicate = CmpInst::getInversePredicate(Predicate);
+ }
+
+ bool SwapArgs; // false -> compare Op0, Op1. true -> compare Op1, Op0.
+ unsigned BranchOpc; // Opcode to jump on, e.g. "X86::JA"
+
+ switch (Predicate) {
+ case CmpInst::FCMP_OEQ:
+ std::swap(TrueMBB, FalseMBB);
+ Predicate = CmpInst::FCMP_UNE;
+ // FALL THROUGH
+ case CmpInst::FCMP_UNE: SwapArgs = false; BranchOpc = X86::JNE_4; break;
+ case CmpInst::FCMP_OGT: SwapArgs = false; BranchOpc = X86::JA_4; break;
+ case CmpInst::FCMP_OGE: SwapArgs = false; BranchOpc = X86::JAE_4; break;
+ case CmpInst::FCMP_OLT: SwapArgs = true; BranchOpc = X86::JA_4; break;
+ case CmpInst::FCMP_OLE: SwapArgs = true; BranchOpc = X86::JAE_4; break;
+ case CmpInst::FCMP_ONE: SwapArgs = false; BranchOpc = X86::JNE_4; break;
+ case CmpInst::FCMP_ORD: SwapArgs = false; BranchOpc = X86::JNP_4; break;
+ case CmpInst::FCMP_UNO: SwapArgs = false; BranchOpc = X86::JP_4; break;
+ case CmpInst::FCMP_UEQ: SwapArgs = false; BranchOpc = X86::JE_4; break;
+ case CmpInst::FCMP_UGT: SwapArgs = true; BranchOpc = X86::JB_4; break;
+ case CmpInst::FCMP_UGE: SwapArgs = true; BranchOpc = X86::JBE_4; break;
+ case CmpInst::FCMP_ULT: SwapArgs = false; BranchOpc = X86::JB_4; break;
+ case CmpInst::FCMP_ULE: SwapArgs = false; BranchOpc = X86::JBE_4; break;
+
+ case CmpInst::ICMP_EQ: SwapArgs = false; BranchOpc = X86::JE_4; break;
+ case CmpInst::ICMP_NE: SwapArgs = false; BranchOpc = X86::JNE_4; break;
+ case CmpInst::ICMP_UGT: SwapArgs = false; BranchOpc = X86::JA_4; break;
+ case CmpInst::ICMP_UGE: SwapArgs = false; BranchOpc = X86::JAE_4; break;
+ case CmpInst::ICMP_ULT: SwapArgs = false; BranchOpc = X86::JB_4; break;
+ case CmpInst::ICMP_ULE: SwapArgs = false; BranchOpc = X86::JBE_4; break;
+ case CmpInst::ICMP_SGT: SwapArgs = false; BranchOpc = X86::JG_4; break;
+ case CmpInst::ICMP_SGE: SwapArgs = false; BranchOpc = X86::JGE_4; break;
+ case CmpInst::ICMP_SLT: SwapArgs = false; BranchOpc = X86::JL_4; break;
+ case CmpInst::ICMP_SLE: SwapArgs = false; BranchOpc = X86::JLE_4; break;
+ default:
+ return false;
+ }
+
+ const Value *Op0 = CI->getOperand(0), *Op1 = CI->getOperand(1);
+ if (SwapArgs)
+ std::swap(Op0, Op1);
+
+ // Emit a compare of the LHS and RHS, setting the flags.
+ if (!X86FastEmitCompare(Op0, Op1, VT))
+ return false;
+
+ BuildMI(MBB, DL, TII.get(BranchOpc)).addMBB(TrueMBB);
+
+ if (Predicate == CmpInst::FCMP_UNE) {
+ // X86 requires a second branch to handle UNE (and OEQ,
+ // which is mapped to UNE above).
+ BuildMI(MBB, DL, TII.get(X86::JP_4)).addMBB(TrueMBB);
+ }
+
+ FastEmitBranch(FalseMBB);
+ MBB->addSuccessor(TrueMBB);
+ return true;
+ }
+ } else if (ExtractValueInst *EI =
+ dyn_cast<ExtractValueInst>(BI->getCondition())) {
+ // Check to see if the branch instruction is from an "arithmetic with
+ // overflow" intrinsic. The main way these intrinsics are used is:
+ //
+ // %t = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
+ // %sum = extractvalue { i32, i1 } %t, 0
+ // %obit = extractvalue { i32, i1 } %t, 1
+ // br i1 %obit, label %overflow, label %normal
+ //
+    // The %sum and %obit are converted into an ADD and a SETO/SETB before
+ // reaching the branch. Therefore, we search backwards through the MBB
+ // looking for the SETO/SETB instruction. If an instruction modifies the
+ // EFLAGS register before we reach the SETO/SETB instruction, then we can't
+ // convert the branch into a JO/JB instruction.
+ if (const IntrinsicInst *CI =
+ dyn_cast<IntrinsicInst>(EI->getAggregateOperand())){
+ if (CI->getIntrinsicID() == Intrinsic::sadd_with_overflow ||
+ CI->getIntrinsicID() == Intrinsic::uadd_with_overflow) {
+ const MachineInstr *SetMI = 0;
+ unsigned Reg = lookUpRegForValue(EI);
+
+ for (MachineBasicBlock::const_reverse_iterator
+ RI = MBB->rbegin(), RE = MBB->rend(); RI != RE; ++RI) {
+ const MachineInstr &MI = *RI;
+
+ if (MI.definesRegister(Reg)) {
+ unsigned Src, Dst, SrcSR, DstSR;
+
+ if (getInstrInfo()->isMoveInstr(MI, Src, Dst, SrcSR, DstSR)) {
+ Reg = Src;
+ continue;
+ }
+
+ SetMI = &MI;
+ break;
+ }
+
+ const TargetInstrDesc &TID = MI.getDesc();
+ if (TID.hasUnmodeledSideEffects() ||
+ TID.hasImplicitDefOfPhysReg(X86::EFLAGS))
+ break;
+ }
+
+ if (SetMI) {
+ unsigned OpCode = SetMI->getOpcode();
+
+ if (OpCode == X86::SETOr || OpCode == X86::SETBr) {
+ BuildMI(MBB, DL, TII.get(OpCode == X86::SETOr ?
+ X86::JO_4 : X86::JB_4))
+ .addMBB(TrueMBB);
+ FastEmitBranch(FalseMBB);
+ MBB->addSuccessor(TrueMBB);
+ return true;
+ }
+ }
+ }
+ }
+ }
+
+ // Otherwise do a clumsy setcc and re-test it.
+ unsigned OpReg = getRegForValue(BI->getCondition());
+ if (OpReg == 0) return false;
+
+ BuildMI(MBB, DL, TII.get(X86::TEST8rr)).addReg(OpReg).addReg(OpReg);
+ BuildMI(MBB, DL, TII.get(X86::JNE_4)).addMBB(TrueMBB);
+ FastEmitBranch(FalseMBB);
+ MBB->addSuccessor(TrueMBB);
+ return true;
+}
+
+bool X86FastISel::X86SelectShift(const Instruction *I) {
+ unsigned CReg = 0, OpReg = 0, OpImm = 0;
+ const TargetRegisterClass *RC = NULL;
+ if (I->getType()->isIntegerTy(8)) {
+ CReg = X86::CL;
+ RC = &X86::GR8RegClass;
+ switch (I->getOpcode()) {
+ case Instruction::LShr: OpReg = X86::SHR8rCL; OpImm = X86::SHR8ri; break;
+ case Instruction::AShr: OpReg = X86::SAR8rCL; OpImm = X86::SAR8ri; break;
+ case Instruction::Shl: OpReg = X86::SHL8rCL; OpImm = X86::SHL8ri; break;
+ default: return false;
+ }
+ } else if (I->getType()->isIntegerTy(16)) {
+ CReg = X86::CX;
+ RC = &X86::GR16RegClass;
+ switch (I->getOpcode()) {
+ case Instruction::LShr: OpReg = X86::SHR16rCL; OpImm = X86::SHR16ri; break;
+ case Instruction::AShr: OpReg = X86::SAR16rCL; OpImm = X86::SAR16ri; break;
+ case Instruction::Shl: OpReg = X86::SHL16rCL; OpImm = X86::SHL16ri; break;
+ default: return false;
+ }
+ } else if (I->getType()->isIntegerTy(32)) {
+ CReg = X86::ECX;
+ RC = &X86::GR32RegClass;
+ switch (I->getOpcode()) {
+ case Instruction::LShr: OpReg = X86::SHR32rCL; OpImm = X86::SHR32ri; break;
+ case Instruction::AShr: OpReg = X86::SAR32rCL; OpImm = X86::SAR32ri; break;
+ case Instruction::Shl: OpReg = X86::SHL32rCL; OpImm = X86::SHL32ri; break;
+ default: return false;
+ }
+ } else if (I->getType()->isIntegerTy(64)) {
+ CReg = X86::RCX;
+ RC = &X86::GR64RegClass;
+ switch (I->getOpcode()) {
+ case Instruction::LShr: OpReg = X86::SHR64rCL; OpImm = X86::SHR64ri; break;
+ case Instruction::AShr: OpReg = X86::SAR64rCL; OpImm = X86::SAR64ri; break;
+ case Instruction::Shl: OpReg = X86::SHL64rCL; OpImm = X86::SHL64ri; break;
+ default: return false;
+ }
+ } else {
+ return false;
+ }
+
+ EVT VT = TLI.getValueType(I->getType(), /*HandleUnknown=*/true);
+ if (VT == MVT::Other || !isTypeLegal(I->getType(), VT))
+ return false;
+
+ unsigned Op0Reg = getRegForValue(I->getOperand(0));
+ if (Op0Reg == 0) return false;
+
+  // If the shift amount is a constant (e.g. shl(x, 3)), fold it into the
+  // immediate form of the shift.
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1))) {
+ unsigned ResultReg = createResultReg(RC);
+ BuildMI(MBB, DL, TII.get(OpImm),
+ ResultReg).addReg(Op0Reg).addImm(CI->getZExtValue() & 0xff);
+ UpdateValueMap(I, ResultReg);
+ return true;
+ }
+
+ unsigned Op1Reg = getRegForValue(I->getOperand(1));
+ if (Op1Reg == 0) return false;
+ TII.copyRegToReg(*MBB, MBB->end(), CReg, Op1Reg, RC, RC, DL);
+
+ // The shift instruction uses X86::CL. If we defined a super-register
+ // of X86::CL, emit an EXTRACT_SUBREG to precisely describe what
+ // we're doing here.
+ if (CReg != X86::CL)
+ BuildMI(MBB, DL, TII.get(TargetOpcode::EXTRACT_SUBREG), X86::CL)
+ .addReg(CReg).addImm(X86::sub_8bit);
+
+ unsigned ResultReg = createResultReg(RC);
+ BuildMI(MBB, DL, TII.get(OpReg), ResultReg).addReg(Op0Reg);
+ UpdateValueMap(I, ResultReg);
+ return true;
+}
+
+bool X86FastISel::X86SelectSelect(const Instruction *I) {
+ EVT VT = TLI.getValueType(I->getType(), /*HandleUnknown=*/true);
+ if (VT == MVT::Other || !isTypeLegal(I->getType(), VT))
+ return false;
+
+ unsigned Opc = 0;
+ const TargetRegisterClass *RC = NULL;
+ if (VT.getSimpleVT() == MVT::i16) {
+ Opc = X86::CMOVE16rr;
+ RC = &X86::GR16RegClass;
+ } else if (VT.getSimpleVT() == MVT::i32) {
+ Opc = X86::CMOVE32rr;
+ RC = &X86::GR32RegClass;
+ } else if (VT.getSimpleVT() == MVT::i64) {
+ Opc = X86::CMOVE64rr;
+ RC = &X86::GR64RegClass;
+ } else {
+ return false;
+ }
+
+ unsigned Op0Reg = getRegForValue(I->getOperand(0));
+ if (Op0Reg == 0) return false;
+ unsigned Op1Reg = getRegForValue(I->getOperand(1));
+ if (Op1Reg == 0) return false;
+ unsigned Op2Reg = getRegForValue(I->getOperand(2));
+ if (Op2Reg == 0) return false;
+
+ BuildMI(MBB, DL, TII.get(X86::TEST8rr)).addReg(Op0Reg).addReg(Op0Reg);
+ unsigned ResultReg = createResultReg(RC);
+ BuildMI(MBB, DL, TII.get(Opc), ResultReg).addReg(Op1Reg).addReg(Op2Reg);
+ UpdateValueMap(I, ResultReg);
+ return true;
+}
+
+bool X86FastISel::X86SelectFPExt(const Instruction *I) {
+ // fpext from float to double.
+ if (Subtarget->hasSSE2() &&
+ I->getType()->isDoubleTy()) {
+ const Value *V = I->getOperand(0);
+ if (V->getType()->isFloatTy()) {
+ unsigned OpReg = getRegForValue(V);
+ if (OpReg == 0) return false;
+ unsigned ResultReg = createResultReg(X86::FR64RegisterClass);
+ BuildMI(MBB, DL, TII.get(X86::CVTSS2SDrr), ResultReg).addReg(OpReg);
+ UpdateValueMap(I, ResultReg);
+ return true;
+ }
+ }
+
+ return false;
+}
+
+bool X86FastISel::X86SelectFPTrunc(const Instruction *I) {
+ if (Subtarget->hasSSE2()) {
+ if (I->getType()->isFloatTy()) {
+ const Value *V = I->getOperand(0);
+ if (V->getType()->isDoubleTy()) {
+ unsigned OpReg = getRegForValue(V);
+ if (OpReg == 0) return false;
+ unsigned ResultReg = createResultReg(X86::FR32RegisterClass);
+ BuildMI(MBB, DL, TII.get(X86::CVTSD2SSrr), ResultReg).addReg(OpReg);
+ UpdateValueMap(I, ResultReg);
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+bool X86FastISel::X86SelectTrunc(const Instruction *I) {
+ if (Subtarget->is64Bit())
+ // All other cases should be handled by the tblgen generated code.
+ return false;
+ EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType());
+ EVT DstVT = TLI.getValueType(I->getType());
+
+ // This code only handles truncation to byte right now.
+ if (DstVT != MVT::i8 && DstVT != MVT::i1)
+ // All other cases should be handled by the tblgen generated code.
+ return false;
+ if (SrcVT != MVT::i16 && SrcVT != MVT::i32)
+ // All other cases should be handled by the tblgen generated code.
+ return false;
+
+ unsigned InputReg = getRegForValue(I->getOperand(0));
+ if (!InputReg)
+ // Unhandled operand. Halt "fast" selection and bail.
+ return false;
+
+ // First issue a copy to GR16_ABCD or GR32_ABCD.
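+  // In 32-bit mode only EAX, EBX, ECX and EDX have an addressable low 8-bit
+  // subregister, so constrain the copy to the ABCD register classes first.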
+ unsigned CopyOpc = (SrcVT == MVT::i16) ? X86::MOV16rr : X86::MOV32rr;
+ const TargetRegisterClass *CopyRC = (SrcVT == MVT::i16)
+ ? X86::GR16_ABCDRegisterClass : X86::GR32_ABCDRegisterClass;
+ unsigned CopyReg = createResultReg(CopyRC);
+ BuildMI(MBB, DL, TII.get(CopyOpc), CopyReg).addReg(InputReg);
+
+ // Then issue an extract_subreg.
+ unsigned ResultReg = FastEmitInst_extractsubreg(MVT::i8,
+ CopyReg, /*Kill=*/true,
+ X86::sub_8bit);
+ if (!ResultReg)
+ return false;
+
+ UpdateValueMap(I, ResultReg);
+ return true;
+}
+
+bool X86FastISel::X86SelectExtractValue(const Instruction *I) {
+ const ExtractValueInst *EI = cast<ExtractValueInst>(I);
+ const Value *Agg = EI->getAggregateOperand();
+
+ if (const IntrinsicInst *CI = dyn_cast<IntrinsicInst>(Agg)) {
+ switch (CI->getIntrinsicID()) {
+ default: break;
+ case Intrinsic::sadd_with_overflow:
+ case Intrinsic::uadd_with_overflow:
+ // Cheat a little. We know that the registers for "add" and "seto" are
+ // allocated sequentially. However, we only keep track of the register
+ // for "add" in the value map. Use extractvalue's index to get the
+ // correct register for "seto".
+ UpdateValueMap(I, lookUpRegForValue(Agg) + *EI->idx_begin());
+ return true;
+ }
+ }
+
+ return false;
+}
+
+bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) {
+ // FIXME: Handle more intrinsics.
+ switch (I.getIntrinsicID()) {
+ default: return false;
+ case Intrinsic::stackprotector: {
+    // Emit inline code to store the stack guard onto the stack.
+ EVT PtrTy = TLI.getPointerTy();
+
+ const Value *Op1 = I.getOperand(1); // The guard's value.
+ const AllocaInst *Slot = cast<AllocaInst>(I.getOperand(2));
+
+ // Grab the frame index.
+ X86AddressMode AM;
+ if (!X86SelectAddress(Slot, AM)) return false;
+
+ if (!X86FastEmitStore(PtrTy, Op1, AM)) return false;
+
+ return true;
+ }
+ case Intrinsic::objectsize: {
+ ConstantInt *CI = dyn_cast<ConstantInt>(I.getOperand(2));
+ const Type *Ty = I.getCalledFunction()->getReturnType();
+
+ assert(CI && "Non-constant type in Intrinsic::objectsize?");
+
+ EVT VT;
+ if (!isTypeLegal(Ty, VT))
+ return false;
+
+ unsigned OpC = 0;
+ if (VT == MVT::i32)
+ OpC = X86::MOV32ri;
+ else if (VT == MVT::i64)
+ OpC = X86::MOV64ri;
+ else
+ return false;
+
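+    // No object size information is available here, so materialize the
+    // conservative answer: -1 when the intrinsic's second operand is 0,
+    // otherwise 0.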
+ unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
+ BuildMI(MBB, DL, TII.get(OpC), ResultReg).
+ addImm(CI->getZExtValue() == 0 ? -1ULL : 0);
+ UpdateValueMap(&I, ResultReg);
+ return true;
+ }
+ case Intrinsic::dbg_declare: {
+ const DbgDeclareInst *DI = cast<DbgDeclareInst>(&I);
+ X86AddressMode AM;
+ assert(DI->getAddress() && "Null address should be checked earlier!");
+ if (!X86SelectAddress(DI->getAddress(), AM))
+ return false;
+ const TargetInstrDesc &II = TII.get(TargetOpcode::DBG_VALUE);
+ // FIXME may need to add RegState::Debug to any registers produced,
+ // although ESP/EBP should be the only ones at the moment.
+ addFullAddress(BuildMI(MBB, DL, II), AM).addImm(0).
+ addMetadata(DI->getVariable());
+ return true;
+ }
+ case Intrinsic::trap: {
+ BuildMI(MBB, DL, TII.get(X86::TRAP));
+ return true;
+ }
+ case Intrinsic::sadd_with_overflow:
+ case Intrinsic::uadd_with_overflow: {
+ // Replace "add with overflow" intrinsics with an "add" instruction followed
+ // by a seto/setc instruction. Later on, when the "extractvalue"
+ // instructions are encountered, we use the fact that two registers were
+ // created sequentially to get the correct registers for the "sum" and the
+ // "overflow bit".
+ const Function *Callee = I.getCalledFunction();
+ const Type *RetTy =
+ cast<StructType>(Callee->getReturnType())->getTypeAtIndex(unsigned(0));
+
+ EVT VT;
+ if (!isTypeLegal(RetTy, VT))
+ return false;
+
+ const Value *Op1 = I.getOperand(1);
+ const Value *Op2 = I.getOperand(2);
+ unsigned Reg1 = getRegForValue(Op1);
+ unsigned Reg2 = getRegForValue(Op2);
+
+ if (Reg1 == 0 || Reg2 == 0)
+ // FIXME: Handle values *not* in registers.
+ return false;
+
+ unsigned OpC = 0;
+ if (VT == MVT::i32)
+ OpC = X86::ADD32rr;
+ else if (VT == MVT::i64)
+ OpC = X86::ADD64rr;
+ else
+ return false;
+
+ unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
+ BuildMI(MBB, DL, TII.get(OpC), ResultReg).addReg(Reg1).addReg(Reg2);
+ unsigned DestReg1 = UpdateValueMap(&I, ResultReg);
+
+ // If the add with overflow is an intra-block value then we just want to
+ // create temporaries for it like normal. If it is a cross-block value then
+ // UpdateValueMap will return the cross-block register used. Since we
+ // *really* want the value to be live in the register pair known by
+ // UpdateValueMap, we have to use DestReg1+1 as the destination register in
+ // the cross block case. In the non-cross-block case, we should just make
+ // another register for the value.
+ if (DestReg1 != ResultReg)
+ ResultReg = DestReg1+1;
+ else
+ ResultReg = createResultReg(TLI.getRegClassFor(MVT::i8));
+
+ unsigned Opc = X86::SETBr;
+ if (I.getIntrinsicID() == Intrinsic::sadd_with_overflow)
+ Opc = X86::SETOr;
+ BuildMI(MBB, DL, TII.get(Opc), ResultReg);
+ return true;
+ }
+ }
+}
+
+bool X86FastISel::X86SelectCall(const Instruction *I) {
+ const CallInst *CI = cast<CallInst>(I);
+ const Value *Callee = I->getOperand(0);
+
+ // Can't handle inline asm yet.
+ if (isa<InlineAsm>(Callee))
+ return false;
+
+ // Handle intrinsic calls.
+ if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI))
+ return X86VisitIntrinsicCall(*II);
+
+ // Handle only C and fastcc calling conventions for now.
+ ImmutableCallSite CS(CI);
+ CallingConv::ID CC = CS.getCallingConv();
+ if (CC != CallingConv::C &&
+ CC != CallingConv::Fast &&
+ CC != CallingConv::X86_FastCall)
+ return false;
+
+ // fastcc with -tailcallopt is intended to provide a guaranteed
+ // tail call optimization. Fastisel doesn't know how to do that.
+ if (CC == CallingConv::Fast && GuaranteedTailCallOpt)
+ return false;
+
+ // Let SDISel handle vararg functions.
+ const PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
+ const FunctionType *FTy = cast<FunctionType>(PT->getElementType());
+ if (FTy->isVarArg())
+ return false;
+
+ // Handle *simple* calls for now.
+ const Type *RetTy = CS.getType();
+ EVT RetVT;
+ if (RetTy->isVoidTy())
+ RetVT = MVT::isVoid;
+ else if (!isTypeLegal(RetTy, RetVT, true))
+ return false;
+
+ // Materialize callee address in a register. FIXME: GV address can be
+ // handled with a CALLpcrel32 instead.
+ X86AddressMode CalleeAM;
+ if (!X86SelectCallAddress(Callee, CalleeAM))
+ return false;
+ unsigned CalleeOp = 0;
+ const GlobalValue *GV = 0;
+ if (CalleeAM.GV != 0) {
+ GV = CalleeAM.GV;
+ } else if (CalleeAM.Base.Reg != 0) {
+ CalleeOp = CalleeAM.Base.Reg;
+ } else
+ return false;
+
+ // Allow calls which produce i1 results.
+ bool AndToI1 = false;
+ if (RetVT == MVT::i1) {
+ RetVT = MVT::i8;
+ AndToI1 = true;
+ }
+
+ // Deal with call operands first.
+ SmallVector<const Value *, 8> ArgVals;
+ SmallVector<unsigned, 8> Args;
+ SmallVector<EVT, 8> ArgVTs;
+ SmallVector<ISD::ArgFlagsTy, 8> ArgFlags;
+ Args.reserve(CS.arg_size());
+ ArgVals.reserve(CS.arg_size());
+ ArgVTs.reserve(CS.arg_size());
+ ArgFlags.reserve(CS.arg_size());
+ for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
+ i != e; ++i) {
+ unsigned Arg = getRegForValue(*i);
+ if (Arg == 0)
+ return false;
+ ISD::ArgFlagsTy Flags;
+ unsigned AttrInd = i - CS.arg_begin() + 1;
+ if (CS.paramHasAttr(AttrInd, Attribute::SExt))
+ Flags.setSExt();
+ if (CS.paramHasAttr(AttrInd, Attribute::ZExt))
+ Flags.setZExt();
+
+ // FIXME: Only handle *easy* calls for now.
+ if (CS.paramHasAttr(AttrInd, Attribute::InReg) ||
+ CS.paramHasAttr(AttrInd, Attribute::StructRet) ||
+ CS.paramHasAttr(AttrInd, Attribute::Nest) ||
+ CS.paramHasAttr(AttrInd, Attribute::ByVal))
+ return false;
+
+ const Type *ArgTy = (*i)->getType();
+ EVT ArgVT;
+ if (!isTypeLegal(ArgTy, ArgVT))
+ return false;
+ unsigned OriginalAlignment = TD.getABITypeAlignment(ArgTy);
+ Flags.setOrigAlign(OriginalAlignment);
+
+ Args.push_back(Arg);
+ ArgVals.push_back(*i);
+ ArgVTs.push_back(ArgVT);
+ ArgFlags.push_back(Flags);
+ }
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CC, false, TM, ArgLocs, I->getParent()->getContext());
+ CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CCAssignFnForCall(CC));
+
+ // Get a count of how many bytes are to be pushed on the stack.
+ unsigned NumBytes = CCInfo.getNextStackOffset();
+
+ // Issue CALLSEQ_START
+ unsigned AdjStackDown = TM.getRegisterInfo()->getCallFrameSetupOpcode();
+ BuildMI(MBB, DL, TII.get(AdjStackDown)).addImm(NumBytes);
+
+  // Process arguments: walk the register/memloc assignments, inserting
+  // copies / loads.
+ SmallVector<unsigned, 4> RegArgs;
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ unsigned Arg = Args[VA.getValNo()];
+ EVT ArgVT = ArgVTs[VA.getValNo()];
+
+ // Promote the value if needed.
+ switch (VA.getLocInfo()) {
+ default: llvm_unreachable("Unknown loc info!");
+ case CCValAssign::Full: break;
+ case CCValAssign::SExt: {
+ bool Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(),
+ Arg, ArgVT, Arg);
+ assert(Emitted && "Failed to emit a sext!"); Emitted=Emitted;
+ Emitted = true;
+ ArgVT = VA.getLocVT();
+ break;
+ }
+ case CCValAssign::ZExt: {
+ bool Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(),
+ Arg, ArgVT, Arg);
+ assert(Emitted && "Failed to emit a zext!"); Emitted=Emitted;
+ Emitted = true;
+ ArgVT = VA.getLocVT();
+ break;
+ }
+ case CCValAssign::AExt: {
+ bool Emitted = X86FastEmitExtend(ISD::ANY_EXTEND, VA.getLocVT(),
+ Arg, ArgVT, Arg);
+ if (!Emitted)
+ Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(),
+ Arg, ArgVT, Arg);
+ if (!Emitted)
+ Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(),
+ Arg, ArgVT, Arg);
+
+      assert(Emitted && "Failed to emit an aext!"); Emitted=Emitted;
+ ArgVT = VA.getLocVT();
+ break;
+ }
+ case CCValAssign::BCvt: {
+ unsigned BC = FastEmit_r(ArgVT.getSimpleVT(), VA.getLocVT().getSimpleVT(),
+ ISD::BIT_CONVERT, Arg, /*TODO: Kill=*/false);
+ assert(BC != 0 && "Failed to emit a bitcast!");
+ Arg = BC;
+ ArgVT = VA.getLocVT();
+ break;
+ }
+ }
+
+ if (VA.isRegLoc()) {
+ TargetRegisterClass* RC = TLI.getRegClassFor(ArgVT);
+ bool Emitted = TII.copyRegToReg(*MBB, MBB->end(), VA.getLocReg(),
+ Arg, RC, RC, DL);
+ assert(Emitted && "Failed to emit a copy instruction!"); Emitted=Emitted;
+ Emitted = true;
+ RegArgs.push_back(VA.getLocReg());
+ } else {
+ unsigned LocMemOffset = VA.getLocMemOffset();
+ X86AddressMode AM;
+ AM.Base.Reg = StackPtr;
+ AM.Disp = LocMemOffset;
+ const Value *ArgVal = ArgVals[VA.getValNo()];
+
+ // If this is a really simple value, emit this with the Value* version of
+ // X86FastEmitStore. If it isn't simple, we don't want to do this, as it
+ // can cause us to reevaluate the argument.
+ if (isa<ConstantInt>(ArgVal) || isa<ConstantPointerNull>(ArgVal))
+ X86FastEmitStore(ArgVT, ArgVal, AM);
+ else
+ X86FastEmitStore(ArgVT, Arg, AM);
+ }
+ }
+
+  // ELF / PIC requires the GOT pointer to be in the EBX register before
+  // making function calls via the PLT.
+ if (Subtarget->isPICStyleGOT()) {
+ TargetRegisterClass *RC = X86::GR32RegisterClass;
+ unsigned Base = getInstrInfo()->getGlobalBaseReg(&MF);
+ bool Emitted = TII.copyRegToReg(*MBB, MBB->end(), X86::EBX, Base, RC, RC,
+ DL);
+ assert(Emitted && "Failed to emit a copy instruction!"); Emitted=Emitted;
+ Emitted = true;
+ }
+
+ // Issue the call.
+ MachineInstrBuilder MIB;
+ if (CalleeOp) {
+ // Register-indirect call.
+ unsigned CallOpc = Subtarget->is64Bit() ? X86::CALL64r : X86::CALL32r;
+ MIB = BuildMI(MBB, DL, TII.get(CallOpc)).addReg(CalleeOp);
+
+ } else {
+ // Direct call.
+ assert(GV && "Not a direct call");
+ unsigned CallOpc =
+ Subtarget->is64Bit() ? X86::CALL64pcrel32 : X86::CALLpcrel32;
+
+ // See if we need any target-specific flags on the GV operand.
+ unsigned char OpFlags = 0;
+
+ // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
+    // external symbols must go through the PLT in PIC mode. If the symbol
+ // has hidden or protected visibility, or if it is static or local, then
+ // we don't need to use the PLT - we can directly call it.
+ if (Subtarget->isTargetELF() &&
+ TM.getRelocationModel() == Reloc::PIC_ &&
+ GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
+ OpFlags = X86II::MO_PLT;
+ } else if (Subtarget->isPICStyleStubAny() &&
+ (GV->isDeclaration() || GV->isWeakForLinker()) &&
+ Subtarget->getDarwinVers() < 9) {
+ // PC-relative references to external symbols should go through $stub,
+ // unless we're building with the leopard linker or later, which
+ // automatically synthesizes these stubs.
+ OpFlags = X86II::MO_DARWIN_STUB;
+ }
+
+
+ MIB = BuildMI(MBB, DL, TII.get(CallOpc)).addGlobalAddress(GV, 0, OpFlags);
+ }
+
+  // Add an implicit use of the GOT pointer in EBX.
+ if (Subtarget->isPICStyleGOT())
+ MIB.addReg(X86::EBX);
+
+ // Add implicit physical register uses to the call.
+ for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
+ MIB.addReg(RegArgs[i]);
+
+ // Issue CALLSEQ_END
+ unsigned AdjStackUp = TM.getRegisterInfo()->getCallFrameDestroyOpcode();
+ BuildMI(MBB, DL, TII.get(AdjStackUp)).addImm(NumBytes).addImm(0);
+
+ // Now handle call return value (if any).
+ if (RetVT.getSimpleVT().SimpleTy != MVT::isVoid) {
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CC, false, TM, RVLocs, I->getParent()->getContext());
+ CCInfo.AnalyzeCallResult(RetVT, RetCC_X86);
+
+ // Copy all of the result registers out of their specified physreg.
+ assert(RVLocs.size() == 1 && "Can't handle multi-value calls!");
+ EVT CopyVT = RVLocs[0].getValVT();
+ TargetRegisterClass* DstRC = TLI.getRegClassFor(CopyVT);
+ TargetRegisterClass *SrcRC = DstRC;
+
+ // If this is a call to a function that returns an fp value on the x87 fp
+ // stack, but where we prefer to use the value in xmm registers, copy it
+ // out as F80 and use a truncate to move it from fp stack reg to xmm reg.
+ if ((RVLocs[0].getLocReg() == X86::ST0 ||
+ RVLocs[0].getLocReg() == X86::ST1) &&
+ isScalarFPTypeInSSEReg(RVLocs[0].getValVT())) {
+ CopyVT = MVT::f80;
+ SrcRC = X86::RSTRegisterClass;
+ DstRC = X86::RFP80RegisterClass;
+ }
+
+ unsigned ResultReg = createResultReg(DstRC);
+ bool Emitted = TII.copyRegToReg(*MBB, MBB->end(), ResultReg,
+ RVLocs[0].getLocReg(), DstRC, SrcRC, DL);
+ assert(Emitted && "Failed to emit a copy instruction!"); Emitted=Emitted;
+ Emitted = true;
+ if (CopyVT != RVLocs[0].getValVT()) {
+      // Round the F80 value to the right size, which also moves it to the
+      // appropriate xmm register. This is accomplished by storing the F80
+      // value in memory and then loading it back. Ewww...
+ EVT ResVT = RVLocs[0].getValVT();
+ unsigned Opc = ResVT == MVT::f32 ? X86::ST_Fp80m32 : X86::ST_Fp80m64;
+ unsigned MemSize = ResVT.getSizeInBits()/8;
+ int FI = MFI.CreateStackObject(MemSize, MemSize, false);
+ addFrameReference(BuildMI(MBB, DL, TII.get(Opc)), FI).addReg(ResultReg);
+ DstRC = ResVT == MVT::f32
+ ? X86::FR32RegisterClass : X86::FR64RegisterClass;
+ Opc = ResVT == MVT::f32 ? X86::MOVSSrm : X86::MOVSDrm;
+ ResultReg = createResultReg(DstRC);
+ addFrameReference(BuildMI(MBB, DL, TII.get(Opc), ResultReg), FI);
+ }
+
+ if (AndToI1) {
+ // Mask out all but lowest bit for some call which produces an i1.
+ unsigned AndResult = createResultReg(X86::GR8RegisterClass);
+ BuildMI(MBB, DL,
+ TII.get(X86::AND8ri), AndResult).addReg(ResultReg).addImm(1);
+ ResultReg = AndResult;
+ }
+
+ UpdateValueMap(I, ResultReg);
+ }
+
+ return true;
+}
+
+
+bool
+X86FastISel::TargetSelectInstruction(const Instruction *I) {
+ switch (I->getOpcode()) {
+ default: break;
+ case Instruction::Load:
+ return X86SelectLoad(I);
+ case Instruction::Store:
+ return X86SelectStore(I);
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ return X86SelectCmp(I);
+ case Instruction::ZExt:
+ return X86SelectZExt(I);
+ case Instruction::Br:
+ return X86SelectBranch(I);
+ case Instruction::Call:
+ return X86SelectCall(I);
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::Shl:
+ return X86SelectShift(I);
+ case Instruction::Select:
+ return X86SelectSelect(I);
+ case Instruction::Trunc:
+ return X86SelectTrunc(I);
+ case Instruction::FPExt:
+ return X86SelectFPExt(I);
+ case Instruction::FPTrunc:
+ return X86SelectFPTrunc(I);
+ case Instruction::ExtractValue:
+ return X86SelectExtractValue(I);
+ case Instruction::IntToPtr: // Deliberate fall-through.
+ case Instruction::PtrToInt: {
+ EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType());
+ EVT DstVT = TLI.getValueType(I->getType());
+ if (DstVT.bitsGT(SrcVT))
+ return X86SelectZExt(I);
+ if (DstVT.bitsLT(SrcVT))
+ return X86SelectTrunc(I);
+ unsigned Reg = getRegForValue(I->getOperand(0));
+ if (Reg == 0) return false;
+ UpdateValueMap(I, Reg);
+ return true;
+ }
+ }
+
+ return false;
+}
+
+unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) {
+ EVT VT;
+ if (!isTypeLegal(C->getType(), VT))
+ return false;
+
+ // Get opcode and regclass of the output for the given load instruction.
+ unsigned Opc = 0;
+ const TargetRegisterClass *RC = NULL;
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: return false;
+ case MVT::i8:
+ Opc = X86::MOV8rm;
+ RC = X86::GR8RegisterClass;
+ break;
+ case MVT::i16:
+ Opc = X86::MOV16rm;
+ RC = X86::GR16RegisterClass;
+ break;
+ case MVT::i32:
+ Opc = X86::MOV32rm;
+ RC = X86::GR32RegisterClass;
+ break;
+ case MVT::i64:
+ // Must be in x86-64 mode.
+ Opc = X86::MOV64rm;
+ RC = X86::GR64RegisterClass;
+ break;
+ case MVT::f32:
+ if (Subtarget->hasSSE1()) {
+ Opc = X86::MOVSSrm;
+ RC = X86::FR32RegisterClass;
+ } else {
+ Opc = X86::LD_Fp32m;
+ RC = X86::RFP32RegisterClass;
+ }
+ break;
+ case MVT::f64:
+ if (Subtarget->hasSSE2()) {
+ Opc = X86::MOVSDrm;
+ RC = X86::FR64RegisterClass;
+ } else {
+ Opc = X86::LD_Fp64m;
+ RC = X86::RFP64RegisterClass;
+ }
+ break;
+ case MVT::f80:
+ // No f80 support yet.
+ return false;
+ }
+
+ // Materialize addresses with LEA instructions.
+ if (isa<GlobalValue>(C)) {
+ X86AddressMode AM;
+ if (X86SelectAddress(C, AM)) {
+ if (TLI.getPointerTy() == MVT::i32)
+ Opc = X86::LEA32r;
+ else
+ Opc = X86::LEA64r;
+ unsigned ResultReg = createResultReg(RC);
+ addLeaAddress(BuildMI(MBB, DL, TII.get(Opc), ResultReg), AM);
+ return ResultReg;
+ }
+ return 0;
+ }
+
+ // MachineConstantPool wants an explicit alignment.
+ unsigned Align = TD.getPrefTypeAlignment(C->getType());
+ if (Align == 0) {
+ // Alignment of vector types. FIXME!
+ Align = TD.getTypeAllocSize(C->getType());
+ }
+
+ // x86-32 PIC requires a PIC base register for constant pools.
+ unsigned PICBase = 0;
+ unsigned char OpFlag = 0;
+ if (Subtarget->isPICStyleStubPIC()) { // Not dynamic-no-pic
+ OpFlag = X86II::MO_PIC_BASE_OFFSET;
+ PICBase = getInstrInfo()->getGlobalBaseReg(&MF);
+ } else if (Subtarget->isPICStyleGOT()) {
+ OpFlag = X86II::MO_GOTOFF;
+ PICBase = getInstrInfo()->getGlobalBaseReg(&MF);
+ } else if (Subtarget->isPICStyleRIPRel() &&
+ TM.getCodeModel() == CodeModel::Small) {
+ PICBase = X86::RIP;
+ }
+
+ // Create the load from the constant pool.
+ unsigned MCPOffset = MCP.getConstantPoolIndex(C, Align);
+ unsigned ResultReg = createResultReg(RC);
+ addConstantPoolReference(BuildMI(MBB, DL, TII.get(Opc), ResultReg),
+ MCPOffset, PICBase, OpFlag);
+
+ return ResultReg;
+}
+
+unsigned X86FastISel::TargetMaterializeAlloca(const AllocaInst *C) {
+ // Fail on dynamic allocas. At this point, getRegForValue has already
+ // checked its CSE maps, so if we're here trying to handle a dynamic
+ // alloca, we're not going to succeed. X86SelectAddress has a
+ // check for dynamic allocas, because it's called directly from
+ // various places, but TargetMaterializeAlloca also needs a check
+ // in order to avoid recursion between getRegForValue,
+// X86SelectAddress, and TargetMaterializeAlloca.
+ if (!StaticAllocaMap.count(C))
+ return 0;
+
+ X86AddressMode AM;
+ if (!X86SelectAddress(C, AM))
+ return 0;
+ unsigned Opc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
+ TargetRegisterClass* RC = TLI.getRegClassFor(TLI.getPointerTy());
+ unsigned ResultReg = createResultReg(RC);
+ addLeaAddress(BuildMI(MBB, DL, TII.get(Opc), ResultReg), AM);
+ return ResultReg;
+}
+
+namespace llvm {
+ llvm::FastISel *X86::createFastISel(MachineFunction &mf,
+ DenseMap<const Value *, unsigned> &vm,
+ DenseMap<const BasicBlock *, MachineBasicBlock *> &bm,
+ DenseMap<const AllocaInst *, int> &am,
+ std::vector<std::pair<MachineInstr*, unsigned> > &pn
+#ifndef NDEBUG
+ , SmallSet<const Instruction *, 8> &cil
+#endif
+ ) {
+ return new X86FastISel(mf, vm, bm, am, pn
+#ifndef NDEBUG
+ , cil
+#endif
+ );
+ }
+}
diff --git a/contrib/llvm/lib/Target/X86/X86FixupKinds.h b/contrib/llvm/lib/Target/X86/X86FixupKinds.h
new file mode 100644
index 0000000..a8117d4
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86FixupKinds.h
@@ -0,0 +1,26 @@
+//===-- X86/X86FixupKinds.h - X86 Specific Fixup Entries --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_X86_X86FIXUPKINDS_H
+#define LLVM_X86_X86FIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+namespace X86 {
+enum Fixups {
+ reloc_pcrel_4byte = FirstTargetFixupKind, // 32-bit pcrel, e.g. a branch.
+ reloc_pcrel_1byte, // 8-bit pcrel, e.g. branch_1
+ reloc_riprel_4byte, // 32-bit rip-relative
+ reloc_riprel_4byte_movq_load // 32-bit rip-relative in movq
+};
+}
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp
new file mode 100644
index 0000000..93460ef
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp
@@ -0,0 +1,1208 @@
+//===-- X86FloatingPoint.cpp - Floating point Reg -> Stack converter ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass which converts floating point instructions from
+// virtual registers into register stack instructions. This pass uses live
+// variable information to indicate where the FPn registers are used and their
+// lifetimes.
+//
+// This pass is hampered by the lack of decent CFG manipulation routines for
+// machine code. In particular, this wants to be able to split critical edges
+// as necessary, traverse the machine basic block CFG in depth-first order, and
+// allow there to be multiple machine basic blocks for each LLVM basicblock
+// (needed for critical edge splitting).
+//
+// In particular, this pass currently barfs on critical edges. Because of this,
+// it requires the instruction selector to insert FP_REG_KILL instructions on
+// the exits of any basic block that has critical edges going from it, or which
+// branch to a critical basic block.
+//
+// FIXME: this is not implemented yet. The stackifier pass only works on local
+// basic blocks.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-codegen"
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include <algorithm>
+using namespace llvm;
+
+STATISTIC(NumFXCH, "Number of fxch instructions inserted");
+STATISTIC(NumFP , "Number of floating point instructions");
+
+namespace {
+ struct FPS : public MachineFunctionPass {
+ static char ID;
+ FPS() : MachineFunctionPass(&ID) {}
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addPreservedID(MachineLoopInfoID);
+ AU.addPreservedID(MachineDominatorsID);
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ virtual bool runOnMachineFunction(MachineFunction &MF);
+
+ virtual const char *getPassName() const { return "X86 FP Stackifier"; }
+
+ private:
+ const TargetInstrInfo *TII; // Machine instruction info.
+ MachineBasicBlock *MBB; // Current basic block
+ unsigned Stack[8]; // FP<n> Registers in each stack slot...
+ unsigned RegMap[8]; // Track which stack slot contains each register
+ unsigned StackTop; // The current top of the FP stack.
+
+ void dumpStack() const {
+ dbgs() << "Stack contents:";
+ for (unsigned i = 0; i != StackTop; ++i) {
+ dbgs() << " FP" << Stack[i];
+ assert(RegMap[Stack[i]] == i && "Stack[] doesn't match RegMap[]!");
+ }
+ dbgs() << "\n";
+ }
+ private:
+ /// isStackEmpty - Return true if the FP stack is empty.
+ bool isStackEmpty() const {
+ return StackTop == 0;
+ }
+
+ // getSlot - Return the stack slot number a particular register number is
+ // in.
+ unsigned getSlot(unsigned RegNo) const {
+ assert(RegNo < 8 && "Regno out of range!");
+ return RegMap[RegNo];
+ }
+
+ // getStackEntry - Return the X86::FP<n> register in register ST(i).
+ unsigned getStackEntry(unsigned STi) const {
+ assert(STi < StackTop && "Access past stack top!");
+ return Stack[StackTop-1-STi];
+ }
+
+ // getSTReg - Return the X86::ST(i) register which contains the specified
+ // FP<RegNo> register.
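+    // ST(0) is always the top of the stack, so the ST number for a register
+    // is its slot's distance below the current top (StackTop - 1).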
+ unsigned getSTReg(unsigned RegNo) const {
+ return StackTop - 1 - getSlot(RegNo) + llvm::X86::ST0;
+ }
+
+ // pushReg - Push the specified FP<n> register onto the stack.
+ void pushReg(unsigned Reg) {
+ assert(Reg < 8 && "Register number out of range!");
+ assert(StackTop < 8 && "Stack overflow!");
+ Stack[StackTop] = Reg;
+ RegMap[Reg] = StackTop++;
+ }
+
+ bool isAtTop(unsigned RegNo) const { return getSlot(RegNo) == StackTop-1; }
+ void moveToTop(unsigned RegNo, MachineBasicBlock::iterator I) {
+ MachineInstr *MI = I;
+ DebugLoc dl = MI->getDebugLoc();
+ if (isAtTop(RegNo)) return;
+
+ unsigned STReg = getSTReg(RegNo);
+ unsigned RegOnTop = getStackEntry(0);
+
+ // Swap the slots the regs are in.
+ std::swap(RegMap[RegNo], RegMap[RegOnTop]);
+
+ // Swap stack slot contents.
+ assert(RegMap[RegOnTop] < StackTop);
+ std::swap(Stack[RegMap[RegOnTop]], Stack[StackTop-1]);
+
+      // Emit an fxch to update the runtime processor's version of the state.
+ BuildMI(*MBB, I, dl, TII->get(X86::XCH_F)).addReg(STReg);
+ NumFXCH++;
+ }
+
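+    // duplicateToTop - Push a copy of the value in FP<RegNo> onto the top of
+    // the stack as FP<AsReg>, by emitting an FP load of the corresponding ST
+    // register.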
+ void duplicateToTop(unsigned RegNo, unsigned AsReg, MachineInstr *I) {
+ DebugLoc dl = I->getDebugLoc();
+ unsigned STReg = getSTReg(RegNo);
+ pushReg(AsReg); // New register on top of stack
+
+ BuildMI(*MBB, I, dl, TII->get(X86::LD_Frr)).addReg(STReg);
+ }
+
+ // popStackAfter - Pop the current value off of the top of the FP stack
+ // after the specified instruction.
+ void popStackAfter(MachineBasicBlock::iterator &I);
+
+ // freeStackSlotAfter - Free the specified register from the register stack,
+ // so that it is no longer in a register. If the register is currently at
+    // the top of the stack, we just pop it; otherwise we store the current
+    // top-of-stack into the specified slot, then pop the top of stack.
+ void freeStackSlotAfter(MachineBasicBlock::iterator &I, unsigned Reg);
+
+ bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB);
+
+ void handleZeroArgFP(MachineBasicBlock::iterator &I);
+ void handleOneArgFP(MachineBasicBlock::iterator &I);
+ void handleOneArgFPRW(MachineBasicBlock::iterator &I);
+ void handleTwoArgFP(MachineBasicBlock::iterator &I);
+ void handleCompareFP(MachineBasicBlock::iterator &I);
+ void handleCondMovFP(MachineBasicBlock::iterator &I);
+ void handleSpecialFP(MachineBasicBlock::iterator &I);
+ };
+ char FPS::ID = 0;
+}
+
+FunctionPass *llvm::createX86FloatingPointStackifierPass() { return new FPS(); }
+
+/// getFPReg - Return the X86::FPx register number for the specified operand.
+/// For example, this returns 3 for X86::FP3.
+static unsigned getFPReg(const MachineOperand &MO) {
+ assert(MO.isReg() && "Expected an FP register!");
+ unsigned Reg = MO.getReg();
+ assert(Reg >= X86::FP0 && Reg <= X86::FP6 && "Expected FP register!");
+ return Reg - X86::FP0;
+}
+
+
+/// runOnMachineFunction - Loop over all of the basic blocks, transforming FP
+/// register references into FP stack references.
+///
+bool FPS::runOnMachineFunction(MachineFunction &MF) {
+ // We only need to run this pass if there are any FP registers used in this
+ // function. If it is all integer, there is nothing for us to do!
+ bool FPIsUsed = false;
+
+ assert(X86::FP6 == X86::FP0+6 && "Register enums aren't sorted right!");
+ for (unsigned i = 0; i <= 6; ++i)
+ if (MF.getRegInfo().isPhysRegUsed(X86::FP0+i)) {
+ FPIsUsed = true;
+ break;
+ }
+
+ // Early exit.
+ if (!FPIsUsed) return false;
+
+ TII = MF.getTarget().getInstrInfo();
+ StackTop = 0;
+
+ // Process the function in depth first order so that we process at least one
+ // of the predecessors for every reachable block in the function.
+ SmallPtrSet<MachineBasicBlock*, 8> Processed;
+ MachineBasicBlock *Entry = MF.begin();
+
+ bool Changed = false;
+ for (df_ext_iterator<MachineBasicBlock*, SmallPtrSet<MachineBasicBlock*, 8> >
+ I = df_ext_begin(Entry, Processed), E = df_ext_end(Entry, Processed);
+ I != E; ++I)
+ Changed |= processBasicBlock(MF, **I);
+
+ // Process any unreachable blocks in arbitrary order now.
+ if (MF.size() == Processed.size())
+ return Changed;
+
+ for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB)
+ if (Processed.insert(BB))
+ Changed |= processBasicBlock(MF, *BB);
+
+ return Changed;
+}
+
+/// processBasicBlock - Loop over all of the instructions in the basic block,
+/// transforming FP instructions into their stack form.
+///
+bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
+ bool Changed = false;
+ MBB = &BB;
+
+ for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
+ MachineInstr *MI = I;
+ unsigned Flags = MI->getDesc().TSFlags;
+
+ unsigned FPInstClass = Flags & X86II::FPTypeMask;
+ if (MI->isInlineAsm())
+ FPInstClass = X86II::SpecialFP;
+
+ if (FPInstClass == X86II::NotFP)
+ continue; // Efficiently ignore non-fp insts!
+
+ MachineInstr *PrevMI = 0;
+ if (I != BB.begin())
+ PrevMI = prior(I);
+
+ ++NumFP; // Keep track of # of pseudo instrs
+ DEBUG(dbgs() << "\nFPInst:\t" << *MI);
+
+ // Get dead variables list now because the MI pointer may be deleted as part
+ // of processing!
+ SmallVector<unsigned, 8> DeadRegs;
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg() && MO.isDead())
+ DeadRegs.push_back(MO.getReg());
+ }
+
+ switch (FPInstClass) {
+ case X86II::ZeroArgFP: handleZeroArgFP(I); break;
+ case X86II::OneArgFP: handleOneArgFP(I); break; // fstp ST(0)
+ case X86II::OneArgFPRW: handleOneArgFPRW(I); break; // ST(0) = fsqrt(ST(0))
+ case X86II::TwoArgFP: handleTwoArgFP(I); break;
+ case X86II::CompareFP: handleCompareFP(I); break;
+ case X86II::CondMovFP: handleCondMovFP(I); break;
+ case X86II::SpecialFP: handleSpecialFP(I); break;
+ default: llvm_unreachable("Unknown FP Type!");
+ }
+
+ // Check to see if any of the values defined by this instruction are dead
+ // after definition. If so, pop them.
+ for (unsigned i = 0, e = DeadRegs.size(); i != e; ++i) {
+ unsigned Reg = DeadRegs[i];
+ if (Reg >= X86::FP0 && Reg <= X86::FP6) {
+ DEBUG(dbgs() << "Register FP#" << Reg-X86::FP0 << " is dead!\n");
+ freeStackSlotAfter(I, Reg-X86::FP0);
+ }
+ }
+
+      // Print out all of the instructions expanded to, if -debug is enabled.
+ DEBUG(
+ MachineBasicBlock::iterator PrevI(PrevMI);
+ if (I == PrevI) {
+ dbgs() << "Just deleted pseudo instruction\n";
+ } else {
+ MachineBasicBlock::iterator Start = I;
+ // Rewind to first instruction newly inserted.
+ while (Start != BB.begin() && prior(Start) != PrevI) --Start;
+ dbgs() << "Inserted instructions:\n\t";
+ Start->print(dbgs(), &MF.getTarget());
+ while (++Start != llvm::next(I)) {}
+ }
+ dumpStack();
+ );
+
+ Changed = true;
+ }
+
+ assert(isStackEmpty() && "Stack not empty at end of basic block?");
+ return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// Efficient Lookup Table Support
+//===----------------------------------------------------------------------===//
+
+namespace {
+ struct TableEntry {
+ unsigned from;
+ unsigned to;
+ bool operator<(const TableEntry &TE) const { return from < TE.from; }
+ friend bool operator<(const TableEntry &TE, unsigned V) {
+ return TE.from < V;
+ }
+ friend bool operator<(unsigned V, const TableEntry &TE) {
+ return V < TE.from;
+ }
+ };
+}
+
+#ifndef NDEBUG
+static bool TableIsSorted(const TableEntry *Table, unsigned NumEntries) {
+ for (unsigned i = 0; i != NumEntries-1; ++i)
+ if (!(Table[i] < Table[i+1])) return false;
+ return true;
+}
+#endif
+
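+// Lookup - Binary search the sorted table for an entry whose 'from' field
+// matches Opcode and return its 'to' field, or -1 if there is no match.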
+static int Lookup(const TableEntry *Table, unsigned N, unsigned Opcode) {
+ const TableEntry *I = std::lower_bound(Table, Table+N, Opcode);
+ if (I != Table+N && I->from == Opcode)
+ return I->to;
+ return -1;
+}
+
+#ifdef NDEBUG
+#define ASSERT_SORTED(TABLE)
+#else
+#define ASSERT_SORTED(TABLE) \
+ { static bool TABLE##Checked = false; \
+ if (!TABLE##Checked) { \
+ assert(TableIsSorted(TABLE, array_lengthof(TABLE)) && \
+ "All lookup tables must be sorted for efficient access!"); \
+ TABLE##Checked = true; \
+ } \
+ }
+#endif
+
+//===----------------------------------------------------------------------===//
+// Register File -> Register Stack Mapping Methods
+//===----------------------------------------------------------------------===//
+
+// OpcodeTable - Sorted map of register instructions to their stack version.
+// The first element is a register file pseudo instruction, the second is the
+// concrete X86 instruction which uses the register stack.
+//
+static const TableEntry OpcodeTable[] = {
+ { X86::ABS_Fp32 , X86::ABS_F },
+ { X86::ABS_Fp64 , X86::ABS_F },
+ { X86::ABS_Fp80 , X86::ABS_F },
+ { X86::ADD_Fp32m , X86::ADD_F32m },
+ { X86::ADD_Fp64m , X86::ADD_F64m },
+ { X86::ADD_Fp64m32 , X86::ADD_F32m },
+ { X86::ADD_Fp80m32 , X86::ADD_F32m },
+ { X86::ADD_Fp80m64 , X86::ADD_F64m },
+ { X86::ADD_FpI16m32 , X86::ADD_FI16m },
+ { X86::ADD_FpI16m64 , X86::ADD_FI16m },
+ { X86::ADD_FpI16m80 , X86::ADD_FI16m },
+ { X86::ADD_FpI32m32 , X86::ADD_FI32m },
+ { X86::ADD_FpI32m64 , X86::ADD_FI32m },
+ { X86::ADD_FpI32m80 , X86::ADD_FI32m },
+ { X86::CHS_Fp32 , X86::CHS_F },
+ { X86::CHS_Fp64 , X86::CHS_F },
+ { X86::CHS_Fp80 , X86::CHS_F },
+ { X86::CMOVBE_Fp32 , X86::CMOVBE_F },
+ { X86::CMOVBE_Fp64 , X86::CMOVBE_F },
+ { X86::CMOVBE_Fp80 , X86::CMOVBE_F },
+ { X86::CMOVB_Fp32 , X86::CMOVB_F },
+ { X86::CMOVB_Fp64 , X86::CMOVB_F },
+ { X86::CMOVB_Fp80 , X86::CMOVB_F },
+ { X86::CMOVE_Fp32 , X86::CMOVE_F },
+ { X86::CMOVE_Fp64 , X86::CMOVE_F },
+ { X86::CMOVE_Fp80 , X86::CMOVE_F },
+ { X86::CMOVNBE_Fp32 , X86::CMOVNBE_F },
+ { X86::CMOVNBE_Fp64 , X86::CMOVNBE_F },
+ { X86::CMOVNBE_Fp80 , X86::CMOVNBE_F },
+ { X86::CMOVNB_Fp32 , X86::CMOVNB_F },
+ { X86::CMOVNB_Fp64 , X86::CMOVNB_F },
+ { X86::CMOVNB_Fp80 , X86::CMOVNB_F },
+ { X86::CMOVNE_Fp32 , X86::CMOVNE_F },
+ { X86::CMOVNE_Fp64 , X86::CMOVNE_F },
+ { X86::CMOVNE_Fp80 , X86::CMOVNE_F },
+ { X86::CMOVNP_Fp32 , X86::CMOVNP_F },
+ { X86::CMOVNP_Fp64 , X86::CMOVNP_F },
+ { X86::CMOVNP_Fp80 , X86::CMOVNP_F },
+ { X86::CMOVP_Fp32 , X86::CMOVP_F },
+ { X86::CMOVP_Fp64 , X86::CMOVP_F },
+ { X86::CMOVP_Fp80 , X86::CMOVP_F },
+ { X86::COS_Fp32 , X86::COS_F },
+ { X86::COS_Fp64 , X86::COS_F },
+ { X86::COS_Fp80 , X86::COS_F },
+ { X86::DIVR_Fp32m , X86::DIVR_F32m },
+ { X86::DIVR_Fp64m , X86::DIVR_F64m },
+ { X86::DIVR_Fp64m32 , X86::DIVR_F32m },
+ { X86::DIVR_Fp80m32 , X86::DIVR_F32m },
+ { X86::DIVR_Fp80m64 , X86::DIVR_F64m },
+ { X86::DIVR_FpI16m32, X86::DIVR_FI16m},
+ { X86::DIVR_FpI16m64, X86::DIVR_FI16m},
+ { X86::DIVR_FpI16m80, X86::DIVR_FI16m},
+ { X86::DIVR_FpI32m32, X86::DIVR_FI32m},
+ { X86::DIVR_FpI32m64, X86::DIVR_FI32m},
+ { X86::DIVR_FpI32m80, X86::DIVR_FI32m},
+ { X86::DIV_Fp32m , X86::DIV_F32m },
+ { X86::DIV_Fp64m , X86::DIV_F64m },
+ { X86::DIV_Fp64m32 , X86::DIV_F32m },
+ { X86::DIV_Fp80m32 , X86::DIV_F32m },
+ { X86::DIV_Fp80m64 , X86::DIV_F64m },
+ { X86::DIV_FpI16m32 , X86::DIV_FI16m },
+ { X86::DIV_FpI16m64 , X86::DIV_FI16m },
+ { X86::DIV_FpI16m80 , X86::DIV_FI16m },
+ { X86::DIV_FpI32m32 , X86::DIV_FI32m },
+ { X86::DIV_FpI32m64 , X86::DIV_FI32m },
+ { X86::DIV_FpI32m80 , X86::DIV_FI32m },
+ { X86::ILD_Fp16m32 , X86::ILD_F16m },
+ { X86::ILD_Fp16m64 , X86::ILD_F16m },
+ { X86::ILD_Fp16m80 , X86::ILD_F16m },
+ { X86::ILD_Fp32m32 , X86::ILD_F32m },
+ { X86::ILD_Fp32m64 , X86::ILD_F32m },
+ { X86::ILD_Fp32m80 , X86::ILD_F32m },
+ { X86::ILD_Fp64m32 , X86::ILD_F64m },
+ { X86::ILD_Fp64m64 , X86::ILD_F64m },
+ { X86::ILD_Fp64m80 , X86::ILD_F64m },
+ { X86::ISTT_Fp16m32 , X86::ISTT_FP16m},
+ { X86::ISTT_Fp16m64 , X86::ISTT_FP16m},
+ { X86::ISTT_Fp16m80 , X86::ISTT_FP16m},
+ { X86::ISTT_Fp32m32 , X86::ISTT_FP32m},
+ { X86::ISTT_Fp32m64 , X86::ISTT_FP32m},
+ { X86::ISTT_Fp32m80 , X86::ISTT_FP32m},
+ { X86::ISTT_Fp64m32 , X86::ISTT_FP64m},
+ { X86::ISTT_Fp64m64 , X86::ISTT_FP64m},
+ { X86::ISTT_Fp64m80 , X86::ISTT_FP64m},
+ { X86::IST_Fp16m32 , X86::IST_F16m },
+ { X86::IST_Fp16m64 , X86::IST_F16m },
+ { X86::IST_Fp16m80 , X86::IST_F16m },
+ { X86::IST_Fp32m32 , X86::IST_F32m },
+ { X86::IST_Fp32m64 , X86::IST_F32m },
+ { X86::IST_Fp32m80 , X86::IST_F32m },
+ { X86::IST_Fp64m32 , X86::IST_FP64m },
+ { X86::IST_Fp64m64 , X86::IST_FP64m },
+ { X86::IST_Fp64m80 , X86::IST_FP64m },
+ { X86::LD_Fp032 , X86::LD_F0 },
+ { X86::LD_Fp064 , X86::LD_F0 },
+ { X86::LD_Fp080 , X86::LD_F0 },
+ { X86::LD_Fp132 , X86::LD_F1 },
+ { X86::LD_Fp164 , X86::LD_F1 },
+ { X86::LD_Fp180 , X86::LD_F1 },
+ { X86::LD_Fp32m , X86::LD_F32m },
+ { X86::LD_Fp32m64 , X86::LD_F32m },
+ { X86::LD_Fp32m80 , X86::LD_F32m },
+ { X86::LD_Fp64m , X86::LD_F64m },
+ { X86::LD_Fp64m80 , X86::LD_F64m },
+ { X86::LD_Fp80m , X86::LD_F80m },
+ { X86::MUL_Fp32m , X86::MUL_F32m },
+ { X86::MUL_Fp64m , X86::MUL_F64m },
+ { X86::MUL_Fp64m32 , X86::MUL_F32m },
+ { X86::MUL_Fp80m32 , X86::MUL_F32m },
+ { X86::MUL_Fp80m64 , X86::MUL_F64m },
+ { X86::MUL_FpI16m32 , X86::MUL_FI16m },
+ { X86::MUL_FpI16m64 , X86::MUL_FI16m },
+ { X86::MUL_FpI16m80 , X86::MUL_FI16m },
+ { X86::MUL_FpI32m32 , X86::MUL_FI32m },
+ { X86::MUL_FpI32m64 , X86::MUL_FI32m },
+ { X86::MUL_FpI32m80 , X86::MUL_FI32m },
+ { X86::SIN_Fp32 , X86::SIN_F },
+ { X86::SIN_Fp64 , X86::SIN_F },
+ { X86::SIN_Fp80 , X86::SIN_F },
+ { X86::SQRT_Fp32 , X86::SQRT_F },
+ { X86::SQRT_Fp64 , X86::SQRT_F },
+ { X86::SQRT_Fp80 , X86::SQRT_F },
+ { X86::ST_Fp32m , X86::ST_F32m },
+ { X86::ST_Fp64m , X86::ST_F64m },
+ { X86::ST_Fp64m32 , X86::ST_F32m },
+ { X86::ST_Fp80m32 , X86::ST_F32m },
+ { X86::ST_Fp80m64 , X86::ST_F64m },
+ { X86::ST_FpP80m , X86::ST_FP80m },
+ { X86::SUBR_Fp32m , X86::SUBR_F32m },
+ { X86::SUBR_Fp64m , X86::SUBR_F64m },
+ { X86::SUBR_Fp64m32 , X86::SUBR_F32m },
+ { X86::SUBR_Fp80m32 , X86::SUBR_F32m },
+ { X86::SUBR_Fp80m64 , X86::SUBR_F64m },
+ { X86::SUBR_FpI16m32, X86::SUBR_FI16m},
+ { X86::SUBR_FpI16m64, X86::SUBR_FI16m},
+ { X86::SUBR_FpI16m80, X86::SUBR_FI16m},
+ { X86::SUBR_FpI32m32, X86::SUBR_FI32m},
+ { X86::SUBR_FpI32m64, X86::SUBR_FI32m},
+ { X86::SUBR_FpI32m80, X86::SUBR_FI32m},
+ { X86::SUB_Fp32m , X86::SUB_F32m },
+ { X86::SUB_Fp64m , X86::SUB_F64m },
+ { X86::SUB_Fp64m32 , X86::SUB_F32m },
+ { X86::SUB_Fp80m32 , X86::SUB_F32m },
+ { X86::SUB_Fp80m64 , X86::SUB_F64m },
+ { X86::SUB_FpI16m32 , X86::SUB_FI16m },
+ { X86::SUB_FpI16m64 , X86::SUB_FI16m },
+ { X86::SUB_FpI16m80 , X86::SUB_FI16m },
+ { X86::SUB_FpI32m32 , X86::SUB_FI32m },
+ { X86::SUB_FpI32m64 , X86::SUB_FI32m },
+ { X86::SUB_FpI32m80 , X86::SUB_FI32m },
+ { X86::TST_Fp32 , X86::TST_F },
+ { X86::TST_Fp64 , X86::TST_F },
+ { X86::TST_Fp80 , X86::TST_F },
+ { X86::UCOM_FpIr32 , X86::UCOM_FIr },
+ { X86::UCOM_FpIr64 , X86::UCOM_FIr },
+ { X86::UCOM_FpIr80 , X86::UCOM_FIr },
+ { X86::UCOM_Fpr32 , X86::UCOM_Fr },
+ { X86::UCOM_Fpr64 , X86::UCOM_Fr },
+ { X86::UCOM_Fpr80 , X86::UCOM_Fr },
+};
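+
+// Note that the Fp32, Fp64 and Fp80 register forms of an operation all map to
+// the same concrete instruction (e.g. ABS_Fp32/64/80 -> ABS_F): the x87
+// register stack always holds values in its 80-bit format, so only memory
+// operand widths need to be distinguished.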
+
+static unsigned getConcreteOpcode(unsigned Opcode) {
+ ASSERT_SORTED(OpcodeTable);
+ int Opc = Lookup(OpcodeTable, array_lengthof(OpcodeTable), Opcode);
+ assert(Opc != -1 && "FP Stack instruction not in OpcodeTable!");
+ return Opc;
+}
+
+//===----------------------------------------------------------------------===//
+// Helper Methods
+//===----------------------------------------------------------------------===//
+
+// PopTable - Sorted map of instructions to their popping version. The first
+// element is an instruction, the second is the version which pops.
+//
+static const TableEntry PopTable[] = {
+ { X86::ADD_FrST0 , X86::ADD_FPrST0 },
+
+ { X86::DIVR_FrST0, X86::DIVR_FPrST0 },
+ { X86::DIV_FrST0 , X86::DIV_FPrST0 },
+
+ { X86::IST_F16m , X86::IST_FP16m },
+ { X86::IST_F32m , X86::IST_FP32m },
+
+ { X86::MUL_FrST0 , X86::MUL_FPrST0 },
+
+ { X86::ST_F32m , X86::ST_FP32m },
+ { X86::ST_F64m , X86::ST_FP64m },
+ { X86::ST_Frr , X86::ST_FPrr },
+
+ { X86::SUBR_FrST0, X86::SUBR_FPrST0 },
+ { X86::SUB_FrST0 , X86::SUB_FPrST0 },
+
+ { X86::UCOM_FIr , X86::UCOM_FIPr },
+
+ { X86::UCOM_FPr , X86::UCOM_FPPr },
+ { X86::UCOM_Fr , X86::UCOM_FPr },
+};
+
+/// popStackAfter - Pop the current value off of the top of the FP stack after
+/// the specified instruction. This attempts to be sneaky and combine the pop
+/// into the instruction itself if possible. The iterator is left pointing to
+/// the last instruction, be it a new pop instruction inserted, or the old
+/// instruction if it was modified in place.
+///
+void FPS::popStackAfter(MachineBasicBlock::iterator &I) {
+ MachineInstr* MI = I;
+ DebugLoc dl = MI->getDebugLoc();
+ ASSERT_SORTED(PopTable);
+ assert(StackTop > 0 && "Cannot pop empty stack!");
+ RegMap[Stack[--StackTop]] = ~0; // Update state
+
+ // Check to see if there is a popping version of this instruction...
+ int Opcode = Lookup(PopTable, array_lengthof(PopTable), I->getOpcode());
+ if (Opcode != -1) {
+ I->setDesc(TII->get(Opcode));
+ if (Opcode == X86::UCOM_FPPr)
+ I->RemoveOperand(0);
+ } else { // Insert an explicit pop
+ I = BuildMI(*MBB, ++I, dl, TII->get(X86::ST_FPrr)).addReg(X86::ST0);
+ }
+}
+
+/// freeStackSlotAfter - Free the specified register from the register stack, so
+/// that it is no longer in a register. If the register is currently at the top
+/// of the stack, we simply pop it after this instruction; otherwise we store
+/// the current top-of-stack into the specified slot, then pop the top of stack.
+void FPS::freeStackSlotAfter(MachineBasicBlock::iterator &I, unsigned FPRegNo) {
+ if (getStackEntry(0) == FPRegNo) { // already at the top of stack? easy.
+ popStackAfter(I);
+ return;
+ }
+
+ // Otherwise, store the top of stack into the dead slot, killing the operand
+ // without having to add in an explicit xchg then pop.
+ //
+ unsigned STReg = getSTReg(FPRegNo);
+ unsigned OldSlot = getSlot(FPRegNo);
+ unsigned TopReg = Stack[StackTop-1];
+ Stack[OldSlot] = TopReg;
+ RegMap[TopReg] = OldSlot;
+ RegMap[FPRegNo] = ~0;
+ Stack[--StackTop] = ~0;
+ MachineInstr *MI = I;
+ DebugLoc dl = MI->getDebugLoc();
+ I = BuildMI(*MBB, ++I, dl, TII->get(X86::ST_FPrr)).addReg(STReg);
+}
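+// Example: with the stack holding (bottom to top) FP2, FP5, FP1 and a request
+// to free FP2, the code above emits "fstp %st(2)"; the top value FP1
+// overwrites the dead FP2 slot and the stack shrinks to FP1, FP5, with no
+// explicit fxch required.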
+
+
+//===----------------------------------------------------------------------===//
+// Instruction transformation implementation
+//===----------------------------------------------------------------------===//
+
+/// handleZeroArgFP - ST(0) = fld0 ST(0) = flds <mem>
+///
+void FPS::handleZeroArgFP(MachineBasicBlock::iterator &I) {
+ MachineInstr *MI = I;
+ unsigned DestReg = getFPReg(MI->getOperand(0));
+
+ // Change from the pseudo instruction to the concrete instruction.
+ MI->RemoveOperand(0); // Remove the explicit ST(0) operand
+ MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
+
+ // Result gets pushed on the stack.
+ pushReg(DestReg);
+}
+
+/// handleOneArgFP - fst <mem>, ST(0)
+///
+void FPS::handleOneArgFP(MachineBasicBlock::iterator &I) {
+ MachineInstr *MI = I;
+ unsigned NumOps = MI->getDesc().getNumOperands();
+ assert((NumOps == X86AddrNumOperands + 1 || NumOps == 1) &&
+ "Can only handle fst* & ftst instructions!");
+
+ // Is this the last use of the source register?
+ unsigned Reg = getFPReg(MI->getOperand(NumOps-1));
+ bool KillsSrc = MI->killsRegister(X86::FP0+Reg);
+
+  // FISTP64m is strange because there isn't a non-popping version.
+  // If we have one _and_ we don't want to pop the operand, duplicate the value
+  // on the stack instead of moving it. This ensures that popping the value is
+  // always OK.
+ // Ditto FISTTP16m, FISTTP32m, FISTTP64m, ST_FpP80m.
+ //
+ if (!KillsSrc &&
+ (MI->getOpcode() == X86::IST_Fp64m32 ||
+ MI->getOpcode() == X86::ISTT_Fp16m32 ||
+ MI->getOpcode() == X86::ISTT_Fp32m32 ||
+ MI->getOpcode() == X86::ISTT_Fp64m32 ||
+ MI->getOpcode() == X86::IST_Fp64m64 ||
+ MI->getOpcode() == X86::ISTT_Fp16m64 ||
+ MI->getOpcode() == X86::ISTT_Fp32m64 ||
+ MI->getOpcode() == X86::ISTT_Fp64m64 ||
+ MI->getOpcode() == X86::IST_Fp64m80 ||
+ MI->getOpcode() == X86::ISTT_Fp16m80 ||
+ MI->getOpcode() == X86::ISTT_Fp32m80 ||
+ MI->getOpcode() == X86::ISTT_Fp64m80 ||
+ MI->getOpcode() == X86::ST_FpP80m)) {
+ duplicateToTop(Reg, 7 /*temp register*/, I);
+ } else {
+ moveToTop(Reg, I); // Move to the top of the stack...
+ }
+
+ // Convert from the pseudo instruction to the concrete instruction.
+ MI->RemoveOperand(NumOps-1); // Remove explicit ST(0) operand
+ MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
+
+ if (MI->getOpcode() == X86::IST_FP64m ||
+ MI->getOpcode() == X86::ISTT_FP16m ||
+ MI->getOpcode() == X86::ISTT_FP32m ||
+ MI->getOpcode() == X86::ISTT_FP64m ||
+ MI->getOpcode() == X86::ST_FP80m) {
+ assert(StackTop > 0 && "Stack empty??");
+ --StackTop;
+ } else if (KillsSrc) { // Last use of operand?
+ popStackAfter(I);
+ }
+}
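+// For example, storing a live FP value with ISTT_Fp32m32 first duplicates it
+// to the top of the stack (via an fld of the source slot), so that the popping
+// fisttp consumes the copy and the original value stays available.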
+
+
+/// handleOneArgFPRW: Handle instructions that read from the top of stack and
+/// replace the value with a newly computed value. These instructions may have
+/// non-fp operands after their FP operands.
+///
+/// Examples:
+/// R1 = fchs R2
+/// R1 = fadd R2, [mem]
+///
+void FPS::handleOneArgFPRW(MachineBasicBlock::iterator &I) {
+ MachineInstr *MI = I;
+#ifndef NDEBUG
+ unsigned NumOps = MI->getDesc().getNumOperands();
+ assert(NumOps >= 2 && "FPRW instructions must have 2 ops!!");
+#endif
+
+ // Is this the last use of the source register?
+ unsigned Reg = getFPReg(MI->getOperand(1));
+ bool KillsSrc = MI->killsRegister(X86::FP0+Reg);
+
+ if (KillsSrc) {
+ // If this is the last use of the source register, just make sure it's on
+ // the top of the stack.
+ moveToTop(Reg, I);
+ assert(StackTop > 0 && "Stack cannot be empty!");
+ --StackTop;
+ pushReg(getFPReg(MI->getOperand(0)));
+ } else {
+ // If this is not the last use of the source register, _copy_ it to the top
+ // of the stack.
+ duplicateToTop(Reg, getFPReg(MI->getOperand(0)), I);
+ }
+
+ // Change from the pseudo instruction to the concrete instruction.
+ MI->RemoveOperand(1); // Drop the source operand.
+ MI->RemoveOperand(0); // Drop the destination operand.
+ MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define tables of various ways to map pseudo instructions
+//
+
+// ForwardST0Table - Map: A = B op C into: ST(0) = ST(0) op ST(i)
+static const TableEntry ForwardST0Table[] = {
+ { X86::ADD_Fp32 , X86::ADD_FST0r },
+ { X86::ADD_Fp64 , X86::ADD_FST0r },
+ { X86::ADD_Fp80 , X86::ADD_FST0r },
+ { X86::DIV_Fp32 , X86::DIV_FST0r },
+ { X86::DIV_Fp64 , X86::DIV_FST0r },
+ { X86::DIV_Fp80 , X86::DIV_FST0r },
+ { X86::MUL_Fp32 , X86::MUL_FST0r },
+ { X86::MUL_Fp64 , X86::MUL_FST0r },
+ { X86::MUL_Fp80 , X86::MUL_FST0r },
+ { X86::SUB_Fp32 , X86::SUB_FST0r },
+ { X86::SUB_Fp64 , X86::SUB_FST0r },
+ { X86::SUB_Fp80 , X86::SUB_FST0r },
+};
+
+// ReverseST0Table - Map: A = B op C into: ST(0) = ST(i) op ST(0)
+static const TableEntry ReverseST0Table[] = {
+ { X86::ADD_Fp32 , X86::ADD_FST0r }, // commutative
+ { X86::ADD_Fp64 , X86::ADD_FST0r }, // commutative
+ { X86::ADD_Fp80 , X86::ADD_FST0r }, // commutative
+ { X86::DIV_Fp32 , X86::DIVR_FST0r },
+ { X86::DIV_Fp64 , X86::DIVR_FST0r },
+ { X86::DIV_Fp80 , X86::DIVR_FST0r },
+ { X86::MUL_Fp32 , X86::MUL_FST0r }, // commutative
+ { X86::MUL_Fp64 , X86::MUL_FST0r }, // commutative
+ { X86::MUL_Fp80 , X86::MUL_FST0r }, // commutative
+ { X86::SUB_Fp32 , X86::SUBR_FST0r },
+ { X86::SUB_Fp64 , X86::SUBR_FST0r },
+ { X86::SUB_Fp80 , X86::SUBR_FST0r },
+};
+
+// ForwardSTiTable - Map: A = B op C into: ST(i) = ST(0) op ST(i)
+static const TableEntry ForwardSTiTable[] = {
+ { X86::ADD_Fp32 , X86::ADD_FrST0 }, // commutative
+ { X86::ADD_Fp64 , X86::ADD_FrST0 }, // commutative
+ { X86::ADD_Fp80 , X86::ADD_FrST0 }, // commutative
+ { X86::DIV_Fp32 , X86::DIVR_FrST0 },
+ { X86::DIV_Fp64 , X86::DIVR_FrST0 },
+ { X86::DIV_Fp80 , X86::DIVR_FrST0 },
+ { X86::MUL_Fp32 , X86::MUL_FrST0 }, // commutative
+ { X86::MUL_Fp64 , X86::MUL_FrST0 }, // commutative
+ { X86::MUL_Fp80 , X86::MUL_FrST0 }, // commutative
+ { X86::SUB_Fp32 , X86::SUBR_FrST0 },
+ { X86::SUB_Fp64 , X86::SUBR_FrST0 },
+ { X86::SUB_Fp80 , X86::SUBR_FrST0 },
+};
+
+// ReverseSTiTable - Map: A = B op C into: ST(i) = ST(i) op ST(0)
+static const TableEntry ReverseSTiTable[] = {
+ { X86::ADD_Fp32 , X86::ADD_FrST0 },
+ { X86::ADD_Fp64 , X86::ADD_FrST0 },
+ { X86::ADD_Fp80 , X86::ADD_FrST0 },
+ { X86::DIV_Fp32 , X86::DIV_FrST0 },
+ { X86::DIV_Fp64 , X86::DIV_FrST0 },
+ { X86::DIV_Fp80 , X86::DIV_FrST0 },
+ { X86::MUL_Fp32 , X86::MUL_FrST0 },
+ { X86::MUL_Fp64 , X86::MUL_FrST0 },
+ { X86::MUL_Fp80 , X86::MUL_FrST0 },
+ { X86::SUB_Fp32 , X86::SUB_FrST0 },
+ { X86::SUB_Fp64 , X86::SUB_FrST0 },
+ { X86::SUB_Fp80 , X86::SUB_FrST0 },
+};
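+
+// handleTwoArgFP below chooses among these four tables using two properties of
+// the instruction: whether the first operand is currently on the top of the
+// stack (Forward vs. Reverse) and whether the result is written into ST(0) or
+// into ST(i) (the ST0 vs. STi tables).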
+
+
+/// handleTwoArgFP - Handle instructions like FADD and friends which are virtual
+/// instructions which need to be simplified and possibly transformed.
+///
+/// Result: ST(0) = fsub ST(0), ST(i)
+/// ST(i) = fsub ST(0), ST(i)
+/// ST(0) = fsubr ST(0), ST(i)
+/// ST(i) = fsubr ST(0), ST(i)
+///
+void FPS::handleTwoArgFP(MachineBasicBlock::iterator &I) {
+ ASSERT_SORTED(ForwardST0Table); ASSERT_SORTED(ReverseST0Table);
+ ASSERT_SORTED(ForwardSTiTable); ASSERT_SORTED(ReverseSTiTable);
+ MachineInstr *MI = I;
+
+ unsigned NumOperands = MI->getDesc().getNumOperands();
+ assert(NumOperands == 3 && "Illegal TwoArgFP instruction!");
+ unsigned Dest = getFPReg(MI->getOperand(0));
+ unsigned Op0 = getFPReg(MI->getOperand(NumOperands-2));
+ unsigned Op1 = getFPReg(MI->getOperand(NumOperands-1));
+ bool KillsOp0 = MI->killsRegister(X86::FP0+Op0);
+ bool KillsOp1 = MI->killsRegister(X86::FP0+Op1);
+ DebugLoc dl = MI->getDebugLoc();
+
+ unsigned TOS = getStackEntry(0);
+
+ // One of our operands must be on the top of the stack. If neither is yet, we
+ // need to move one.
+ if (Op0 != TOS && Op1 != TOS) { // No operand at TOS?
+ // We can choose to move either operand to the top of the stack. If one of
+ // the operands is killed by this instruction, we want that one so that we
+ // can update right on top of the old version.
+ if (KillsOp0) {
+ moveToTop(Op0, I); // Move dead operand to TOS.
+ TOS = Op0;
+ } else if (KillsOp1) {
+ moveToTop(Op1, I);
+ TOS = Op1;
+ } else {
+ // All of the operands are live after this instruction executes, so we
+ // cannot update on top of any operand. Because of this, we must
+ // duplicate one of the stack elements to the top. It doesn't matter
+ // which one we pick.
+ //
+ duplicateToTop(Op0, Dest, I);
+ Op0 = TOS = Dest;
+ KillsOp0 = true;
+ }
+ } else if (!KillsOp0 && !KillsOp1) {
+ // If we DO have one of our operands at the top of the stack, but we don't
+ // have a dead operand, we must duplicate one of the operands to a new slot
+ // on the stack.
+ duplicateToTop(Op0, Dest, I);
+ Op0 = TOS = Dest;
+ KillsOp0 = true;
+ }
+
+ // Now we know that one of our operands is on the top of the stack, and at
+ // least one of our operands is killed by this instruction.
+ assert((TOS == Op0 || TOS == Op1) && (KillsOp0 || KillsOp1) &&
+ "Stack conditions not set up right!");
+
+ // We decide which form to use based on what is on the top of the stack, and
+ // which operand is killed by this instruction.
+ const TableEntry *InstTable;
+ bool isForward = TOS == Op0;
+ bool updateST0 = (TOS == Op0 && !KillsOp1) || (TOS == Op1 && !KillsOp0);
+ if (updateST0) {
+ if (isForward)
+ InstTable = ForwardST0Table;
+ else
+ InstTable = ReverseST0Table;
+ } else {
+ if (isForward)
+ InstTable = ForwardSTiTable;
+ else
+ InstTable = ReverseSTiTable;
+ }
+
+ int Opcode = Lookup(InstTable, array_lengthof(ForwardST0Table),
+ MI->getOpcode());
+ assert(Opcode != -1 && "Unknown TwoArgFP pseudo instruction!");
+
+ // NotTOS - The register which is not on the top of stack...
+ unsigned NotTOS = (TOS == Op0) ? Op1 : Op0;
+
+ // Replace the old instruction with a new instruction
+ MBB->remove(I++);
+ I = BuildMI(*MBB, I, dl, TII->get(Opcode)).addReg(getSTReg(NotTOS));
+
+ // If both operands are killed, pop one off of the stack in addition to
+ // overwriting the other one.
+ if (KillsOp0 && KillsOp1 && Op0 != Op1) {
+ assert(!updateST0 && "Should have updated other operand!");
+ popStackAfter(I); // Pop the top of stack
+ }
+
+ // Update stack information so that we know the destination register is now on
+ // the stack.
+ unsigned UpdatedSlot = getSlot(updateST0 ? TOS : NotTOS);
+ assert(UpdatedSlot < StackTop && Dest < 7);
+ Stack[UpdatedSlot] = Dest;
+ RegMap[Dest] = UpdatedSlot;
+ MBB->getParent()->DeleteMachineInstr(MI); // Remove the old instruction
+}
+
+/// handleCompareFP - Handle FUCOM and FUCOMI instructions, which have two FP
+/// register arguments and no explicit destinations.
+///
+void FPS::handleCompareFP(MachineBasicBlock::iterator &I) {
+ ASSERT_SORTED(ForwardST0Table); ASSERT_SORTED(ReverseST0Table);
+ ASSERT_SORTED(ForwardSTiTable); ASSERT_SORTED(ReverseSTiTable);
+ MachineInstr *MI = I;
+
+ unsigned NumOperands = MI->getDesc().getNumOperands();
+ assert(NumOperands == 2 && "Illegal FUCOM* instruction!");
+ unsigned Op0 = getFPReg(MI->getOperand(NumOperands-2));
+ unsigned Op1 = getFPReg(MI->getOperand(NumOperands-1));
+ bool KillsOp0 = MI->killsRegister(X86::FP0+Op0);
+ bool KillsOp1 = MI->killsRegister(X86::FP0+Op1);
+
+ // Make sure the first operand is on the top of stack, the other one can be
+ // anywhere.
+ moveToTop(Op0, I);
+
+ // Change from the pseudo instruction to the concrete instruction.
+ MI->getOperand(0).setReg(getSTReg(Op1));
+ MI->RemoveOperand(1);
+ MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
+
+ // If any of the operands are killed by this instruction, free them.
+ if (KillsOp0) freeStackSlotAfter(I, Op0);
+ if (KillsOp1 && Op0 != Op1) freeStackSlotAfter(I, Op1);
+}
+
+/// handleCondMovFP - Handle two address conditional move instructions. These
+/// instructions move a st(i) register to st(0) iff a condition is true. These
+/// instructions require that the first operand is at the top of the stack, but
+/// otherwise don't modify the stack at all.
+void FPS::handleCondMovFP(MachineBasicBlock::iterator &I) {
+ MachineInstr *MI = I;
+
+ unsigned Op0 = getFPReg(MI->getOperand(0));
+ unsigned Op1 = getFPReg(MI->getOperand(2));
+ bool KillsOp1 = MI->killsRegister(X86::FP0+Op1);
+
+ // The first operand *must* be on the top of the stack.
+ moveToTop(Op0, I);
+
+ // Change the second operand to the stack register that the operand is in.
+ // Change from the pseudo instruction to the concrete instruction.
+ MI->RemoveOperand(0);
+ MI->RemoveOperand(1);
+ MI->getOperand(0).setReg(getSTReg(Op1));
+ MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
+
+ // If we kill the second operand, make sure to pop it from the stack.
+ if (Op0 != Op1 && KillsOp1) {
+ // Get this value off of the register stack.
+ freeStackSlotAfter(I, Op1);
+ }
+}
+
+
+/// handleSpecialFP - Handle special instructions which behave unlike other
+/// floating point instructions. This is primarily intended for use by pseudo
+/// instructions.
+///
+void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) {
+ MachineInstr *MI = I;
+ DebugLoc dl = MI->getDebugLoc();
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("Unknown SpecialFP instruction!");
+ case X86::FpGET_ST0_32:// Appears immediately after a call returning FP type!
+ case X86::FpGET_ST0_64:// Appears immediately after a call returning FP type!
+ case X86::FpGET_ST0_80:// Appears immediately after a call returning FP type!
+ assert(StackTop == 0 && "Stack should be empty after a call!");
+ pushReg(getFPReg(MI->getOperand(0)));
+ break;
+ case X86::FpGET_ST1_32:// Appears immediately after a call returning FP type!
+ case X86::FpGET_ST1_64:// Appears immediately after a call returning FP type!
+ case X86::FpGET_ST1_80:{// Appears immediately after a call returning FP type!
+ // FpGET_ST1 should occur right after a FpGET_ST0 for a call or inline asm.
+ // The pattern we expect is:
+ // CALL
+ // FP1 = FpGET_ST0
+ // FP4 = FpGET_ST1
+ //
+ // At this point, we've pushed FP1 on the top of stack, so it should be
+ // present if it isn't dead. If it was dead, we already emitted a pop to
+ // remove it from the stack and StackTop = 0.
+
+ // Push FP4 as top of stack next.
+ pushReg(getFPReg(MI->getOperand(0)));
+
+ // If StackTop was 0 before we pushed our operand, then ST(0) must have been
+ // dead. In this case, the ST(1) value is the only thing that is live, so
+    // it should be on the TOS (after the pop that was emitted), and indeed it
+    // is. Just continue in this case.
+ if (StackTop == 1)
+ break;
+
+ // Because pushReg just pushed ST(1) as TOS, we now have to swap the two top
+ // elements so that our accounting is correct.
+ unsigned RegOnTop = getStackEntry(0);
+ unsigned RegNo = getStackEntry(1);
+
+ // Swap the slots the regs are in.
+ std::swap(RegMap[RegNo], RegMap[RegOnTop]);
+
+ // Swap stack slot contents.
+ assert(RegMap[RegOnTop] < StackTop);
+ std::swap(Stack[RegMap[RegOnTop]], Stack[StackTop-1]);
+ break;
+ }
+ case X86::FpSET_ST0_32:
+ case X86::FpSET_ST0_64:
+ case X86::FpSET_ST0_80: {
+ unsigned Op0 = getFPReg(MI->getOperand(0));
+
+ // FpSET_ST0_80 is generated by copyRegToReg for both function return
+    // and inline assembly with the "st" constraint. In the latter case,
+ // it is possible for ST(0) to be alive after this instruction.
+ if (!MI->killsRegister(X86::FP0 + Op0)) {
+ // Duplicate Op0
+ duplicateToTop(0, 7 /*temp register*/, I);
+ } else {
+ moveToTop(Op0, I);
+ }
+ --StackTop; // "Forget" we have something on the top of stack!
+ break;
+ }
+ case X86::FpSET_ST1_32:
+ case X86::FpSET_ST1_64:
+ case X86::FpSET_ST1_80:
+ // StackTop can be 1 if a FpSET_ST0_* was before this. Exchange them.
+ if (StackTop == 1) {
+ BuildMI(*MBB, I, dl, TII->get(X86::XCH_F)).addReg(X86::ST1);
+ NumFXCH++;
+ StackTop = 0;
+ break;
+ }
+    assert(StackTop == 2 && "Stack should have two elements on it to return!");
+ --StackTop; // "Forget" we have something on the top of stack!
+ break;
+ case X86::MOV_Fp3232:
+ case X86::MOV_Fp3264:
+ case X86::MOV_Fp6432:
+ case X86::MOV_Fp6464:
+ case X86::MOV_Fp3280:
+ case X86::MOV_Fp6480:
+ case X86::MOV_Fp8032:
+ case X86::MOV_Fp8064:
+ case X86::MOV_Fp8080: {
+ const MachineOperand &MO1 = MI->getOperand(1);
+ unsigned SrcReg = getFPReg(MO1);
+
+ const MachineOperand &MO0 = MI->getOperand(0);
+    // These can be created due to inline asm. The two-address pass can introduce
+ // copies from RFP registers to virtual registers.
+ if (MO0.getReg() == X86::ST0 && SrcReg == 0) {
+ assert(MO1.isKill());
+ // Treat %ST0<def> = MOV_Fp8080 %FP0<kill>
+ // like FpSET_ST0_80 %FP0<kill>, %ST0<imp-def>
+ assert((StackTop == 1 || StackTop == 2)
+             && "Stack should have one or two elements on it to return!");
+ --StackTop; // "Forget" we have something on the top of stack!
+ break;
+ } else if (MO0.getReg() == X86::ST1 && SrcReg == 1) {
+ assert(MO1.isKill());
+ // Treat %ST1<def> = MOV_Fp8080 %FP1<kill>
+ // like FpSET_ST1_80 %FP0<kill>, %ST1<imp-def>
+ // StackTop can be 1 if a FpSET_ST0_* was before this. Exchange them.
+ if (StackTop == 1) {
+ BuildMI(*MBB, I, dl, TII->get(X86::XCH_F)).addReg(X86::ST1);
+ NumFXCH++;
+ StackTop = 0;
+ break;
+ }
+      assert(StackTop == 2 && "Stack should have two elements on it to return!");
+ --StackTop; // "Forget" we have something on the top of stack!
+ break;
+ }
+
+ unsigned DestReg = getFPReg(MO0);
+ if (MI->killsRegister(X86::FP0+SrcReg)) {
+ // If the input operand is killed, we can just change the owner of the
+ // incoming stack slot into the result.
+ unsigned Slot = getSlot(SrcReg);
+ assert(Slot < 7 && DestReg < 7 && "FpMOV operands invalid!");
+ Stack[Slot] = DestReg;
+ RegMap[DestReg] = Slot;
+
+ } else {
+ // For FMOV we just duplicate the specified value to a new stack slot.
+ // This could be made better, but would require substantial changes.
+ duplicateToTop(SrcReg, DestReg, I);
+ }
+ }
+ break;
+ case TargetOpcode::INLINEASM: {
+ // The inline asm MachineInstr currently only *uses* FP registers for the
+ // 'f' constraint. These should be turned into the current ST(x) register
+ // in the machine instr. Also, any kills should be explicitly popped after
+ // the inline asm.
+ unsigned Kills = 0;
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &Op = MI->getOperand(i);
+ if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
+ continue;
+ assert(Op.isUse() && "Only handle inline asm uses right now");
+
+ unsigned FPReg = getFPReg(Op);
+ Op.setReg(getSTReg(FPReg));
+
+ // If we kill this operand, make sure to pop it from the stack after the
+ // asm. We just remember it for now, and pop them all off at the end in
+ // a batch.
+ if (Op.isKill())
+ Kills |= 1U << FPReg;
+ }
+
+ // If this asm kills any FP registers (is the last use of them) we must
+ // explicitly emit pop instructions for them. Do this now after the asm has
+ // executed so that the ST(x) numbers are not off (which would happen if we
+ // did this inline with operand rewriting).
+ //
+ // Note: this might be a non-optimal pop sequence. We might be able to do
+ // better by trying to pop in stack order or something.
+ MachineBasicBlock::iterator InsertPt = MI;
+ while (Kills) {
+ unsigned FPReg = CountTrailingZeros_32(Kills);
+ freeStackSlotAfter(InsertPt, FPReg);
+ Kills &= ~(1U << FPReg);
+ }
+ // Don't delete the inline asm!
+ return;
+ }
+
+ case X86::RET:
+ case X86::RETI:
+ // If RET has an FP register use operand, pass the first one in ST(0) and
+ // the second one in ST(1).
+ if (isStackEmpty()) return; // Quick check to see if any are possible.
+
+ // Find the register operands.
+ unsigned FirstFPRegOp = ~0U, SecondFPRegOp = ~0U;
+
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &Op = MI->getOperand(i);
+ if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
+ continue;
+ // FP Register uses must be kills unless there are two uses of the same
+ // register, in which case only one will be a kill.
+ assert(Op.isUse() &&
+ (Op.isKill() || // Marked kill.
+ getFPReg(Op) == FirstFPRegOp || // Second instance.
+ MI->killsRegister(Op.getReg())) && // Later use is marked kill.
+ "Ret only defs operands, and values aren't live beyond it");
+
+ if (FirstFPRegOp == ~0U)
+ FirstFPRegOp = getFPReg(Op);
+ else {
+ assert(SecondFPRegOp == ~0U && "More than two fp operands!");
+ SecondFPRegOp = getFPReg(Op);
+ }
+
+ // Remove the operand so that later passes don't see it.
+ MI->RemoveOperand(i);
+ --i, --e;
+ }
+
+ // There are only four possibilities here:
+ // 1) we are returning a single FP value. In this case, it has to be in
+ // ST(0) already, so just declare success by removing the value from the
+ // FP Stack.
+ if (SecondFPRegOp == ~0U) {
+ // Assert that the top of stack contains the right FP register.
+ assert(StackTop == 1 && FirstFPRegOp == getStackEntry(0) &&
+ "Top of stack not the right register for RET!");
+
+ // Ok, everything is good, mark the value as not being on the stack
+ // anymore so that our assertion about the stack being empty at end of
+ // block doesn't fire.
+ StackTop = 0;
+ return;
+ }
+
+ // Otherwise, we are returning two values:
+ // 2) If returning the same value for both, we only have one thing in the FP
+ // stack. Consider: RET FP1, FP1
+ if (StackTop == 1) {
+ assert(FirstFPRegOp == SecondFPRegOp && FirstFPRegOp == getStackEntry(0)&&
+ "Stack misconfiguration for RET!");
+
+ // Duplicate the TOS so that we return it twice. Just pick some other FPx
+ // register to hold it.
+ unsigned NewReg = (FirstFPRegOp+1)%7;
+ duplicateToTop(FirstFPRegOp, NewReg, MI);
+ FirstFPRegOp = NewReg;
+ }
+
+ /// Okay we know we have two different FPx operands now:
+ assert(StackTop == 2 && "Must have two values live!");
+
+ /// 3) If SecondFPRegOp is currently in ST(0) and FirstFPRegOp is currently
+ /// in ST(1). In this case, emit an fxch.
+ if (getStackEntry(0) == SecondFPRegOp) {
+ assert(getStackEntry(1) == FirstFPRegOp && "Unknown regs live");
+ moveToTop(FirstFPRegOp, MI);
+ }
+
+ /// 4) Finally, FirstFPRegOp must be in ST(0) and SecondFPRegOp must be in
+ /// ST(1). Just remove both from our understanding of the stack and return.
+ assert(getStackEntry(0) == FirstFPRegOp && "Unknown regs live");
+ assert(getStackEntry(1) == SecondFPRegOp && "Unknown regs live");
+ StackTop = 0;
+ return;
+ }
+
+ I = MBB->erase(I); // Remove the pseudo instruction
+ --I;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86FloatingPointRegKill.cpp b/contrib/llvm/lib/Target/X86/X86FloatingPointRegKill.cpp
new file mode 100644
index 0000000..747683d
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86FloatingPointRegKill.cpp
@@ -0,0 +1,156 @@
+//===-- X86FloatingPointRegKill.cpp - FP_REG_KILL inserter ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass which inserts FP_REG_KILL instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-codegen"
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "llvm/Instructions.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+STATISTIC(NumFPKill, "Number of FP_REG_KILL instructions added");
+
+namespace {
+ struct FPRegKiller : public MachineFunctionPass {
+ static char ID;
+ FPRegKiller() : MachineFunctionPass(&ID) {}
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addPreservedID(MachineLoopInfoID);
+ AU.addPreservedID(MachineDominatorsID);
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ virtual bool runOnMachineFunction(MachineFunction &MF);
+
+ virtual const char *getPassName() const {
+ return "X86 FP_REG_KILL inserter";
+ }
+ };
+ char FPRegKiller::ID = 0;
+}
+
+FunctionPass *llvm::createX87FPRegKillInserterPass() {
+ return new FPRegKiller();
+}
+
+/// isFPStackVReg - Return true if the specified vreg is from a fp stack
+/// register class.
+static bool isFPStackVReg(unsigned RegNo, const MachineRegisterInfo &MRI) {
+ if (!TargetRegisterInfo::isVirtualRegister(RegNo))
+ return false;
+
+ switch (MRI.getRegClass(RegNo)->getID()) {
+ default: return false;
+ case X86::RFP32RegClassID:
+ case X86::RFP64RegClassID:
+ case X86::RFP80RegClassID:
+ return true;
+ }
+}
+
+
+/// ContainsFPStackCode - Return true if the specified MBB has floating point
+/// stack code, and thus needs an FP_REG_KILL.
+static bool ContainsFPStackCode(MachineBasicBlock *MBB,
+ const MachineRegisterInfo &MRI) {
+ // Scan the block, looking for instructions that define fp stack vregs.
+ for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
+ I != E; ++I) {
+ if (I->getNumOperands() == 0 || !I->getOperand(0).isReg())
+ continue;
+
+ for (unsigned op = 0, e = I->getNumOperands(); op != e; ++op) {
+ if (!I->getOperand(op).isReg() || !I->getOperand(op).isDef())
+ continue;
+
+ if (isFPStackVReg(I->getOperand(op).getReg(), MRI))
+ return true;
+ }
+ }
+
+ // Check PHI nodes in successor blocks. These PHI's will be lowered to have
+ // a copy of the input value in this block, which is a definition of the
+ // value.
+ for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+ E = MBB->succ_end(); SI != E; ++ SI) {
+ MachineBasicBlock *SuccBB = *SI;
+ for (MachineBasicBlock::iterator I = SuccBB->begin(), E = SuccBB->end();
+ I != E; ++I) {
+ // All PHI nodes are at the top of the block.
+ if (!I->isPHI()) break;
+
+ if (isFPStackVReg(I->getOperand(0).getReg(), MRI))
+ return true;
+ }
+ }
+
+ return false;
+}
+
+bool FPRegKiller::runOnMachineFunction(MachineFunction &MF) {
+ // If we are emitting FP stack code, scan the basic block to determine if this
+ // block defines any FP values. If so, put an FP_REG_KILL instruction before
+ // the terminator of the block.
+
+ // Note that FP stack instructions are used in all modes for long double,
+ // so we always need to do this check.
+ // Also note that it's possible for an FP stack register to be live across
+ // an instruction that produces multiple basic blocks (SSE CMOV) so we
+ // must check all the generated basic blocks.
+
+ // Scan all of the machine instructions in these MBBs, checking for FP
+ // stores. (RFP32 and RFP64 will not exist in SSE mode, but RFP80 might.)
+
+ // Fast-path: If nothing is using the x87 registers, we don't need to do
+ // any scanning.
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ if (MRI.getRegClassVirtRegs(X86::RFP80RegisterClass).empty() &&
+ MRI.getRegClassVirtRegs(X86::RFP64RegisterClass).empty() &&
+ MRI.getRegClassVirtRegs(X86::RFP32RegisterClass).empty())
+ return false;
+
+ bool Changed = false;
+ MachineFunction::iterator MBBI = MF.begin();
+ MachineFunction::iterator EndMBB = MF.end();
+ for (; MBBI != EndMBB; ++MBBI) {
+ MachineBasicBlock *MBB = MBBI;
+
+ // If this block returns, ignore it. We don't want to insert an FP_REG_KILL
+ // before the return.
+ if (!MBB->empty()) {
+ MachineBasicBlock::iterator EndI = MBB->end();
+ --EndI;
+ if (EndI->getDesc().isReturn())
+ continue;
+ }
+
+ // If we find any FP stack code, emit the FP_REG_KILL instruction.
+ if (ContainsFPStackCode(MBB, MRI)) {
+ BuildMI(*MBB, MBBI->getFirstTerminator(), DebugLoc(),
+ MF.getTarget().getInstrInfo()->get(X86::FP_REG_KILL));
+ ++NumFPKill;
+ Changed = true;
+ }
+ }
+
+ return Changed;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
new file mode 100644
index 0000000..0f64383
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -0,0 +1,2000 @@
+//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a DAG pattern matching instruction selector for X86,
+// converting from a legalized dag to a X86 dag.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-isel"
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Type.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
+
+//===----------------------------------------------------------------------===//
+// Pattern Matcher Implementation
+//===----------------------------------------------------------------------===//
+
+namespace {
+ /// X86ISelAddressMode - This corresponds to X86AddressMode, but uses
+ /// SDValue's instead of register numbers for the leaves of the matched
+ /// tree.
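+  /// The matched address has the general form
+  ///   Base + Scale*IndexReg + Disp,
+  /// where Base is either a register or a frame index, Disp may carry a
+  /// symbolic part (at most one of GV, CP, ES, JT or BlockAddr), and an
+  /// optional Segment register may be attached.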
+ struct X86ISelAddressMode {
+ enum {
+ RegBase,
+ FrameIndexBase
+ } BaseType;
+
+ // This is really a union, discriminated by BaseType!
+ SDValue Base_Reg;
+ int Base_FrameIndex;
+
+ unsigned Scale;
+ SDValue IndexReg;
+ int32_t Disp;
+ SDValue Segment;
+ const GlobalValue *GV;
+ const Constant *CP;
+ const BlockAddress *BlockAddr;
+ const char *ES;
+ int JT;
+ unsigned Align; // CP alignment.
+ unsigned char SymbolFlags; // X86II::MO_*
+
+ X86ISelAddressMode()
+ : BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0),
+ Segment(), GV(0), CP(0), BlockAddr(0), ES(0), JT(-1), Align(0),
+ SymbolFlags(X86II::MO_NO_FLAG) {
+ }
+
+ bool hasSymbolicDisplacement() const {
+ return GV != 0 || CP != 0 || ES != 0 || JT != -1 || BlockAddr != 0;
+ }
+
+ bool hasBaseOrIndexReg() const {
+ return IndexReg.getNode() != 0 || Base_Reg.getNode() != 0;
+ }
+
+ /// isRIPRelative - Return true if this addressing mode is already RIP
+ /// relative.
+ bool isRIPRelative() const {
+ if (BaseType != RegBase) return false;
+ if (RegisterSDNode *RegNode =
+ dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
+ return RegNode->getReg() == X86::RIP;
+ return false;
+ }
+
+ void setBaseReg(SDValue Reg) {
+ BaseType = RegBase;
+ Base_Reg = Reg;
+ }
+
+ void dump() {
+ dbgs() << "X86ISelAddressMode " << this << '\n';
+ dbgs() << "Base_Reg ";
+ if (Base_Reg.getNode() != 0)
+ Base_Reg.getNode()->dump();
+ else
+ dbgs() << "nul";
+ dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n'
+ << " Scale" << Scale << '\n'
+ << "IndexReg ";
+ if (IndexReg.getNode() != 0)
+ IndexReg.getNode()->dump();
+ else
+ dbgs() << "nul";
+ dbgs() << " Disp " << Disp << '\n'
+ << "GV ";
+ if (GV)
+ GV->dump();
+ else
+ dbgs() << "nul";
+ dbgs() << " CP ";
+ if (CP)
+ CP->dump();
+ else
+ dbgs() << "nul";
+ dbgs() << '\n'
+ << "ES ";
+ if (ES)
+ dbgs() << ES;
+ else
+ dbgs() << "nul";
+ dbgs() << " JT" << JT << " Align" << Align << '\n';
+ }
+ };
+}
+
+namespace {
+ class X86ISelListener : public SelectionDAG::DAGUpdateListener {
+ SmallSet<SDNode*, 4> Deletes;
+ public:
+ explicit X86ISelListener() {}
+ virtual void NodeDeleted(SDNode *N, SDNode *E) {
+ Deletes.insert(N);
+ }
+ virtual void NodeUpdated(SDNode *N) {
+ // Ignore updates.
+ }
+ bool IsDeleted(SDNode *N) {
+ return Deletes.count(N);
+ }
+ };
+
+ //===--------------------------------------------------------------------===//
+ /// ISel - X86 specific code to select X86 machine instructions for
+ /// SelectionDAG operations.
+ ///
+ class X86DAGToDAGISel : public SelectionDAGISel {
+ /// X86Lowering - This object fully describes how to lower LLVM code to an
+ /// X86-specific SelectionDAG.
+ const X86TargetLowering &X86Lowering;
+
+ /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
+ /// make the right decision when generating code for different targets.
+ const X86Subtarget *Subtarget;
+
+ /// OptForSize - If true, selector should try to optimize for code size
+ /// instead of performance.
+ bool OptForSize;
+
+ public:
+ explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel)
+ : SelectionDAGISel(tm, OptLevel),
+ X86Lowering(*tm.getTargetLowering()),
+ Subtarget(&tm.getSubtarget<X86Subtarget>()),
+ OptForSize(false) {}
+
+ virtual const char *getPassName() const {
+ return "X86 DAG->DAG Instruction Selection";
+ }
+
+ virtual void EmitFunctionEntryCode();
+
+ virtual bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const;
+
+ virtual void PreprocessISelDAG();
+
+// Include the pieces autogenerated from the target description.
+#include "X86GenDAGISel.inc"
+
+ private:
+ SDNode *Select(SDNode *N);
+ SDNode *SelectAtomic64(SDNode *Node, unsigned Opc);
+ SDNode *SelectAtomicLoadAdd(SDNode *Node, EVT NVT);
+
+ bool MatchSegmentBaseAddress(SDValue N, X86ISelAddressMode &AM);
+ bool MatchLoad(SDValue N, X86ISelAddressMode &AM);
+ bool MatchWrapper(SDValue N, X86ISelAddressMode &AM);
+ bool MatchAddress(SDValue N, X86ISelAddressMode &AM);
+ bool MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
+ X86ISelListener &DeadNodes,
+ unsigned Depth);
+ bool MatchAddressBase(SDValue N, X86ISelAddressMode &AM);
+ bool SelectAddr(SDNode *Op, SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index, SDValue &Disp,
+ SDValue &Segment);
+ bool SelectLEAAddr(SDNode *Op, SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index, SDValue &Disp);
+ bool SelectTLSADDRAddr(SDNode *Op, SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index, SDValue &Disp);
+ bool SelectScalarSSELoad(SDNode *Root, SDValue N,
+ SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp,
+ SDValue &Segment,
+ SDValue &NodeWithChain);
+
+ bool TryFoldLoad(SDNode *P, SDValue N,
+ SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp,
+ SDValue &Segment);
+
+ /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+ /// inline asm expressions.
+ virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+ char ConstraintCode,
+ std::vector<SDValue> &OutOps);
+
+ void EmitSpecialCodeForMain(MachineBasicBlock *BB, MachineFrameInfo *MFI);
+
+ inline void getAddressOperands(X86ISelAddressMode &AM, SDValue &Base,
+ SDValue &Scale, SDValue &Index,
+ SDValue &Disp, SDValue &Segment) {
+ Base = (AM.BaseType == X86ISelAddressMode::FrameIndexBase) ?
+ CurDAG->getTargetFrameIndex(AM.Base_FrameIndex, TLI.getPointerTy()) :
+ AM.Base_Reg;
+ Scale = getI8Imm(AM.Scale);
+ Index = AM.IndexReg;
+ // These are 32-bit even in 64-bit mode since RIP relative offset
+ // is 32-bit.
+ if (AM.GV)
+ Disp = CurDAG->getTargetGlobalAddress(AM.GV, MVT::i32, AM.Disp,
+ AM.SymbolFlags);
+ else if (AM.CP)
+ Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32,
+ AM.Align, AM.Disp, AM.SymbolFlags);
+ else if (AM.ES)
+ Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
+ else if (AM.JT != -1)
+ Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
+ else if (AM.BlockAddr)
+ Disp = CurDAG->getBlockAddress(AM.BlockAddr, MVT::i32,
+ true, AM.SymbolFlags);
+ else
+ Disp = CurDAG->getTargetConstant(AM.Disp, MVT::i32);
+
+ if (AM.Segment.getNode())
+ Segment = AM.Segment;
+ else
+ Segment = CurDAG->getRegister(0, MVT::i32);
+ }
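+    // The five values produced here (Base, Scale, Index, Disp, Segment) form
+    // the standard five-operand X86 memory reference expected by the selected
+    // instruction patterns.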
+
+ /// getI8Imm - Return a target constant with the specified value, of type
+ /// i8.
+ inline SDValue getI8Imm(unsigned Imm) {
+ return CurDAG->getTargetConstant(Imm, MVT::i8);
+ }
+
+ /// getI16Imm - Return a target constant with the specified value, of type
+ /// i16.
+ inline SDValue getI16Imm(unsigned Imm) {
+ return CurDAG->getTargetConstant(Imm, MVT::i16);
+ }
+
+ /// getI32Imm - Return a target constant with the specified value, of type
+ /// i32.
+ inline SDValue getI32Imm(unsigned Imm) {
+ return CurDAG->getTargetConstant(Imm, MVT::i32);
+ }
+
+ /// getGlobalBaseReg - Return an SDNode that returns the value of
+ /// the global base register. Output instructions required to
+ /// initialize the global base register, if necessary.
+ ///
+ SDNode *getGlobalBaseReg();
+
+ /// getTargetMachine - Return a reference to the TargetMachine, casted
+ /// to the target-specific type.
+ const X86TargetMachine &getTargetMachine() {
+ return static_cast<const X86TargetMachine &>(TM);
+ }
+
+ /// getInstrInfo - Return a reference to the TargetInstrInfo, casted
+ /// to the target-specific type.
+ const X86InstrInfo *getInstrInfo() {
+ return getTargetMachine().getInstrInfo();
+ }
+ };
+}
+
+
+bool
+X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
+ if (OptLevel == CodeGenOpt::None) return false;
+
+ if (!N.hasOneUse())
+ return false;
+
+ if (N.getOpcode() != ISD::LOAD)
+ return true;
+
+ // If N is a load, do additional profitability checks.
+ if (U == Root) {
+ switch (U->getOpcode()) {
+ default: break;
+ case X86ISD::ADD:
+ case X86ISD::SUB:
+ case X86ISD::AND:
+ case X86ISD::XOR:
+ case X86ISD::OR:
+ case ISD::ADD:
+ case ISD::ADDC:
+ case ISD::ADDE:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR: {
+ SDValue Op1 = U->getOperand(1);
+
+      // If the other operand is an 8-bit immediate we should fold the immediate
+ // instead. This reduces code size.
+ // e.g.
+ // movl 4(%esp), %eax
+ // addl $4, %eax
+ // vs.
+ // movl $4, %eax
+ // addl 4(%esp), %eax
+      // The former is 2 bytes shorter. In the case where the increment is 1,
+      // the saving can be 4 bytes (by using incl %eax).
+ if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Op1))
+ if (Imm->getAPIntValue().isSignedIntN(8))
+ return false;
+
+ // If the other operand is a TLS address, we should fold it instead.
+ // This produces
+ // movl %gs:0, %eax
+ // leal i@NTPOFF(%eax), %eax
+ // instead of
+ // movl $i@NTPOFF, %eax
+ // addl %gs:0, %eax
+ // if the block also has an access to a second TLS address this will save
+ // a load.
+ // FIXME: This is probably also true for non TLS addresses.
+ if (Op1.getOpcode() == X86ISD::Wrapper) {
+ SDValue Val = Op1.getOperand(0);
+ if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
+ return false;
+ }
+ }
+ }
+ }
+
+ return true;
+}
+
+/// MoveBelowOrigChain - Replace the original chain operand of the call with
+/// the load's chain operand and move the load below the call's chain operand.
+static void MoveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
+ SDValue Call, SDValue OrigChain) {
+ SmallVector<SDValue, 8> Ops;
+ SDValue Chain = OrigChain.getOperand(0);
+ if (Chain.getNode() == Load.getNode())
+ Ops.push_back(Load.getOperand(0));
+ else {
+ assert(Chain.getOpcode() == ISD::TokenFactor &&
+ "Unexpected chain operand");
+ for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
+ if (Chain.getOperand(i).getNode() == Load.getNode())
+ Ops.push_back(Load.getOperand(0));
+ else
+ Ops.push_back(Chain.getOperand(i));
+ SDValue NewChain =
+ CurDAG->getNode(ISD::TokenFactor, Load.getDebugLoc(),
+ MVT::Other, &Ops[0], Ops.size());
+ Ops.clear();
+ Ops.push_back(NewChain);
+ }
+ for (unsigned i = 1, e = OrigChain.getNumOperands(); i != e; ++i)
+ Ops.push_back(OrigChain.getOperand(i));
+ CurDAG->UpdateNodeOperands(OrigChain, &Ops[0], Ops.size());
+ CurDAG->UpdateNodeOperands(Load, Call.getOperand(0),
+ Load.getOperand(1), Load.getOperand(2));
+ Ops.clear();
+ Ops.push_back(SDValue(Load.getNode(), 1));
+ for (unsigned i = 1, e = Call.getNode()->getNumOperands(); i != e; ++i)
+ Ops.push_back(Call.getOperand(i));
+ CurDAG->UpdateNodeOperands(Call, &Ops[0], Ops.size());
+}
+
+/// isCalleeLoad - Return true if the call address is a load and it can be
+/// moved below CALLSEQ_START and the chains leading up to the call.
+/// Return the CALLSEQ_START by reference as a second output.
+/// In the case of a tail call, there isn't a callseq node between the call
+/// chain and the load.
+static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
+ if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
+ return false;
+ LoadSDNode *LD = dyn_cast<LoadSDNode>(Callee.getNode());
+ if (!LD ||
+ LD->isVolatile() ||
+ LD->getAddressingMode() != ISD::UNINDEXED ||
+ LD->getExtensionType() != ISD::NON_EXTLOAD)
+ return false;
+
+ // Now let's find the callseq_start.
+ while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
+ if (!Chain.hasOneUse())
+ return false;
+ Chain = Chain.getOperand(0);
+ }
+
+ if (!Chain.getNumOperands())
+ return false;
+ if (Chain.getOperand(0).getNode() == Callee.getNode())
+ return true;
+ if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
+ Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
+ Callee.getValue(1).hasOneUse())
+ return true;
+ return false;
+}
+
+void X86DAGToDAGISel::PreprocessISelDAG() {
+ // OptForSize is used in pattern predicates that isel is matching.
+ OptForSize = MF->getFunction()->hasFnAttr(Attribute::OptimizeForSize);
+
+ for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
+ E = CurDAG->allnodes_end(); I != E; ) {
+ SDNode *N = I++; // Preincrement iterator to avoid invalidation issues.
+
+ if (OptLevel != CodeGenOpt::None &&
+ (N->getOpcode() == X86ISD::CALL ||
+ N->getOpcode() == X86ISD::TC_RETURN)) {
+ /// Also try moving call address load from outside callseq_start to just
+ /// before the call to allow it to be folded.
+ ///
+ /// [Load chain]
+ /// ^
+ /// |
+ /// [Load]
+ /// ^ ^
+ /// | |
+ /// / \--
+ /// / |
+ ///[CALLSEQ_START] |
+ /// ^ |
+ /// | |
+ /// [LOAD/C2Reg] |
+ /// | |
+ /// \ /
+ /// \ /
+ /// [CALL]
+ bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
+ SDValue Chain = N->getOperand(0);
+ SDValue Load = N->getOperand(1);
+ if (!isCalleeLoad(Load, Chain, HasCallSeq))
+ continue;
+ MoveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
+ ++NumLoadMoved;
+ continue;
+ }
+
+    // Lower fpround and fpextend nodes that target the FP stack to be a store and
+ // load to the stack. This is a gross hack. We would like to simply mark
+ // these as being illegal, but when we do that, legalize produces these when
+ // it expands calls, then expands these in the same legalize pass. We would
+ // like dag combine to be able to hack on these between the call expansion
+ // and the node legalization. As such this pass basically does "really
+ // late" legalization of these inline with the X86 isel pass.
+ // FIXME: This should only happen when not compiled with -O0.
+ if (N->getOpcode() != ISD::FP_ROUND && N->getOpcode() != ISD::FP_EXTEND)
+ continue;
+
+ // If the source and destination are SSE registers, then this is a legal
+ // conversion that should not be lowered.
+ EVT SrcVT = N->getOperand(0).getValueType();
+ EVT DstVT = N->getValueType(0);
+ bool SrcIsSSE = X86Lowering.isScalarFPTypeInSSEReg(SrcVT);
+ bool DstIsSSE = X86Lowering.isScalarFPTypeInSSEReg(DstVT);
+ if (SrcIsSSE && DstIsSSE)
+ continue;
+
+ if (!SrcIsSSE && !DstIsSSE) {
+ // If this is an FPStack extension, it is a noop.
+ if (N->getOpcode() == ISD::FP_EXTEND)
+ continue;
+ // If this is a value-preserving FPStack truncation, it is a noop.
+ if (N->getConstantOperandVal(1))
+ continue;
+ }
+
+ // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
+ // FPStack has extload and truncstore. SSE can fold direct loads into other
+ // operations. Based on this, decide what we want to do.
+ EVT MemVT;
+ if (N->getOpcode() == ISD::FP_ROUND)
+ MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'.
+ else
+ MemVT = SrcIsSSE ? SrcVT : DstVT;
+
+ SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
+ DebugLoc dl = N->getDebugLoc();
+
+ // FIXME: optimize the case where the src/dest is a load or store?
+ SDValue Store = CurDAG->getTruncStore(CurDAG->getEntryNode(), dl,
+ N->getOperand(0),
+ MemTmp, NULL, 0, MemVT,
+ false, false, 0);
+ SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
+ NULL, 0, MemVT, false, false, 0);
+
+ // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
+ // extload we created. This will cause general havok on the dag because
+ // anything below the conversion could be folded into other existing nodes.
+ // To avoid invalidating 'I', back it up to the convert node.
+ --I;
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
+
+ // Now that we did that, the node is dead. Increment the iterator to the
+ // next node to process, then delete N.
+ ++I;
+ CurDAG->DeleteNode(N);
+ }
+}
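+// The net effect of the lowering above: for example, an f64 -> f32 FP_ROUND
+// between x87 values becomes a truncating f32 store to a fresh stack temporary
+// followed by an f32 reload, forcing the rounding to happen through memory.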
+
+
+/// EmitSpecialCodeForMain - Emit any code that needs to be executed only in
+/// the main function.
+void X86DAGToDAGISel::EmitSpecialCodeForMain(MachineBasicBlock *BB,
+ MachineFrameInfo *MFI) {
+ const TargetInstrInfo *TII = TM.getInstrInfo();
+ if (Subtarget->isTargetCygMing())
+ BuildMI(BB, DebugLoc(),
+ TII->get(X86::CALLpcrel32)).addExternalSymbol("__main");
+}
+
+void X86DAGToDAGISel::EmitFunctionEntryCode() {
+ // If this is main, emit special code for main.
+ if (const Function *Fn = MF->getFunction())
+ if (Fn->hasExternalLinkage() && Fn->getName() == "main")
+ EmitSpecialCodeForMain(MF->begin(), MF->getFrameInfo());
+}
+
+
+bool X86DAGToDAGISel::MatchSegmentBaseAddress(SDValue N,
+ X86ISelAddressMode &AM) {
+ assert(N.getOpcode() == X86ISD::SegmentBaseAddress);
+ SDValue Segment = N.getOperand(0);
+
+ if (AM.Segment.getNode() == 0) {
+ AM.Segment = Segment;
+ return false;
+ }
+
+ return true;
+}
+
+bool X86DAGToDAGISel::MatchLoad(SDValue N, X86ISelAddressMode &AM) {
+ // This optimization is valid because the GNU TLS model defines that
+ // gs:0 (or fs:0 on X86-64) contains its own address.
+ // For more information see http://people.redhat.com/drepper/tls.pdf
+
+ SDValue Address = N.getOperand(1);
+ if (Address.getOpcode() == X86ISD::SegmentBaseAddress &&
+ !MatchSegmentBaseAddress (Address, AM))
+ return false;
+
+ return true;
+}
+
+/// MatchWrapper - Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes
+/// into an addressing mode. These wrap things that will resolve down into a
+/// symbol reference. If no match is possible, this returns true, otherwise it
+/// returns false.
+bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) {
+ // If the addressing mode already has a symbol as the displacement, we can
+ // never match another symbol.
+ if (AM.hasSymbolicDisplacement())
+ return true;
+
+ SDValue N0 = N.getOperand(0);
+ CodeModel::Model M = TM.getCodeModel();
+
+ // Handle X86-64 rip-relative addresses. We check this before checking direct
+ // folding because RIP is preferable to non-RIP accesses.
+ if (Subtarget->is64Bit() &&
+ // Under X86-64 non-small code model, GV (and friends) are 64-bits, so
+ // they cannot be folded into immediate fields.
+ // FIXME: This can be improved for kernel and other models?
+ (M == CodeModel::Small || M == CodeModel::Kernel) &&
+ // Base and index reg must be 0 in order to use %rip as base and lowering
+ // must allow RIP.
+ !AM.hasBaseOrIndexReg() && N.getOpcode() == X86ISD::WrapperRIP) {
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
+ int64_t Offset = AM.Disp + G->getOffset();
+ if (!X86::isOffsetSuitableForCodeModel(Offset, M)) return true;
+ AM.GV = G->getGlobal();
+ AM.Disp = Offset;
+ AM.SymbolFlags = G->getTargetFlags();
+ } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
+ int64_t Offset = AM.Disp + CP->getOffset();
+ if (!X86::isOffsetSuitableForCodeModel(Offset, M)) return true;
+ AM.CP = CP->getConstVal();
+ AM.Align = CP->getAlignment();
+ AM.Disp = Offset;
+ AM.SymbolFlags = CP->getTargetFlags();
+ } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
+ AM.ES = S->getSymbol();
+ AM.SymbolFlags = S->getTargetFlags();
+ } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
+ AM.JT = J->getIndex();
+ AM.SymbolFlags = J->getTargetFlags();
+ } else {
+ AM.BlockAddr = cast<BlockAddressSDNode>(N0)->getBlockAddress();
+ AM.SymbolFlags = cast<BlockAddressSDNode>(N0)->getTargetFlags();
+ }
+
+ if (N.getOpcode() == X86ISD::WrapperRIP)
+ AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
+ return false;
+ }
+
+ // Handle the case when globals fit in our immediate field: This is true for
+ // X86-32 always and X86-64 when in -static -mcmodel=small mode. In 64-bit
+ // mode, this results in a non-RIP-relative computation.
+ if (!Subtarget->is64Bit() ||
+ ((M == CodeModel::Small || M == CodeModel::Kernel) &&
+ TM.getRelocationModel() == Reloc::Static)) {
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
+ AM.GV = G->getGlobal();
+ AM.Disp += G->getOffset();
+ AM.SymbolFlags = G->getTargetFlags();
+ } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
+ AM.CP = CP->getConstVal();
+ AM.Align = CP->getAlignment();
+ AM.Disp += CP->getOffset();
+ AM.SymbolFlags = CP->getTargetFlags();
+ } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
+ AM.ES = S->getSymbol();
+ AM.SymbolFlags = S->getTargetFlags();
+ } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
+ AM.JT = J->getIndex();
+ AM.SymbolFlags = J->getTargetFlags();
+ } else {
+ AM.BlockAddr = cast<BlockAddressSDNode>(N0)->getBlockAddress();
+ AM.SymbolFlags = cast<BlockAddressSDNode>(N0)->getTargetFlags();
+ }
+ return false;
+ }
+
+ return true;
+}
+
+/// MatchAddress - Add the specified node to the specified addressing mode,
+/// returning true if it cannot be done. This just pattern matches for the
+/// addressing mode.
+bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM) {
+ X86ISelListener DeadNodes;
+ if (MatchAddressRecursively(N, AM, DeadNodes, 0))
+ return true;
+
+ // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
+ // a smaller encoding and avoids a scaled-index.
+ if (AM.Scale == 2 &&
+ AM.BaseType == X86ISelAddressMode::RegBase &&
+ AM.Base_Reg.getNode() == 0) {
+ AM.Base_Reg = AM.IndexReg;
+ AM.Scale = 1;
+ }
+
+ // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
+ // because it has a smaller encoding.
+ // TODO: Which other code models can use this?
+ if (TM.getCodeModel() == CodeModel::Small &&
+ Subtarget->is64Bit() &&
+ AM.Scale == 1 &&
+ AM.BaseType == X86ISelAddressMode::RegBase &&
+ AM.Base_Reg.getNode() == 0 &&
+ AM.IndexReg.getNode() == 0 &&
+ AM.SymbolFlags == X86II::MO_NO_FLAG &&
+ AM.hasSymbolicDisplacement())
+ AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
+
+ return false;
+}
+
+/// isLogicallyAddWithConstant - Return true if this node is semantically an
+/// add of a value with a constant integer.
+static bool isLogicallyAddWithConstant(SDValue V, SelectionDAG *CurDAG) {
+ // Check for (add x, Cst)
+ if (V->getOpcode() == ISD::ADD)
+ return isa<ConstantSDNode>(V->getOperand(1));
+
+ // Check for (or x, Cst), where Cst & x == 0.
+ if (V->getOpcode() != ISD::OR ||
+ !isa<ConstantSDNode>(V->getOperand(1)))
+ return false;
+
+ // Handle "X | C" as "X + C" iff X is known to have C bits clear.
+ ConstantSDNode *CN = cast<ConstantSDNode>(V->getOperand(1));
+
+ // Check to see if the LHS & C is zero.
+ return CurDAG->MaskedValueIsZero(V->getOperand(0), CN->getAPIntValue());
+}
+
+bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
+ X86ISelListener &DeadNodes,
+ unsigned Depth) {
+ bool is64Bit = Subtarget->is64Bit();
+ DebugLoc dl = N.getDebugLoc();
+ DEBUG({
+ dbgs() << "MatchAddress: ";
+ AM.dump();
+ });
+ // Limit recursion.
+ if (Depth > 5)
+ return MatchAddressBase(N, AM);
+
+ CodeModel::Model M = TM.getCodeModel();
+
+ // If this is already a %rip relative address, we can only merge immediates
+ // into it. Instead of handling this in every case, we handle it here.
+ // RIP relative addressing: %rip + 32-bit displacement!
+ if (AM.isRIPRelative()) {
+ // FIXME: JumpTable and ExternalSymbol address currently don't like
+ // displacements. It isn't very important, but this should be fixed for
+ // consistency.
+ if (!AM.ES && AM.JT != -1) return true;
+
+ if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N)) {
+ int64_t Val = AM.Disp + Cst->getSExtValue();
+ if (X86::isOffsetSuitableForCodeModel(Val, M,
+ AM.hasSymbolicDisplacement())) {
+ AM.Disp = Val;
+ return false;
+ }
+ }
+ return true;
+ }
+
+ switch (N.getOpcode()) {
+ default: break;
+ case ISD::Constant: {
+ uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
+ if (!is64Bit ||
+ X86::isOffsetSuitableForCodeModel(AM.Disp + Val, M,
+ AM.hasSymbolicDisplacement())) {
+ AM.Disp += Val;
+ return false;
+ }
+ break;
+ }
+
+ case X86ISD::SegmentBaseAddress:
+ if (!MatchSegmentBaseAddress(N, AM))
+ return false;
+ break;
+
+ case X86ISD::Wrapper:
+ case X86ISD::WrapperRIP:
+ if (!MatchWrapper(N, AM))
+ return false;
+ break;
+
+ case ISD::LOAD:
+ if (!MatchLoad(N, AM))
+ return false;
+ break;
+
+ case ISD::FrameIndex:
+ if (AM.BaseType == X86ISelAddressMode::RegBase
+ && AM.Base_Reg.getNode() == 0) {
+ AM.BaseType = X86ISelAddressMode::FrameIndexBase;
+ AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
+ return false;
+ }
+ break;
+
+ case ISD::SHL:
+ if (AM.IndexReg.getNode() != 0 || AM.Scale != 1)
+ break;
+
+ if (ConstantSDNode
+ *CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1))) {
+ unsigned Val = CN->getZExtValue();
+ // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
+ // that the base operand remains free for further matching. If
+ // the base doesn't end up getting used, a post-processing step
+ // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
+ if (Val == 1 || Val == 2 || Val == 3) {
+ AM.Scale = 1 << Val;
+ SDValue ShVal = N.getNode()->getOperand(0);
+
+ // Okay, we know that we have a scale by now. However, if the scaled
+ // value is an add of something and a constant, we can fold the
+ // constant into the disp field here.
+ if (isLogicallyAddWithConstant(ShVal, CurDAG)) {
+ AM.IndexReg = ShVal.getNode()->getOperand(0);
+ ConstantSDNode *AddVal =
+ cast<ConstantSDNode>(ShVal.getNode()->getOperand(1));
+ uint64_t Disp = AM.Disp + (AddVal->getSExtValue() << Val);
+ if (!is64Bit ||
+ X86::isOffsetSuitableForCodeModel(Disp, M,
+ AM.hasSymbolicDisplacement()))
+ AM.Disp = Disp;
+ else
+ AM.IndexReg = ShVal;
+ } else {
+ AM.IndexReg = ShVal;
+ }
+ return false;
+ }
+ break;
+ }
+
+ case ISD::SMUL_LOHI:
+ case ISD::UMUL_LOHI:
+ // A mul_lohi where we need the low part can be folded as a plain multiply.
+ if (N.getResNo() != 0) break;
+ // FALL THROUGH
+ case ISD::MUL:
+ case X86ISD::MUL_IMM:
+ // X*[3,5,9] -> X+X*[2,4,8]
+ if (AM.BaseType == X86ISelAddressMode::RegBase &&
+ AM.Base_Reg.getNode() == 0 &&
+ AM.IndexReg.getNode() == 0) {
+ if (ConstantSDNode
+ *CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1)))
+ if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
+ CN->getZExtValue() == 9) {
+ AM.Scale = unsigned(CN->getZExtValue())-1;
+
+ SDValue MulVal = N.getNode()->getOperand(0);
+ SDValue Reg;
+
+ // Okay, we know that we have a scale by now. However, if the scaled
+ // value is an add of something and a constant, we can fold the
+ // constant into the disp field here.
+ if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
+ isa<ConstantSDNode>(MulVal.getNode()->getOperand(1))) {
+ Reg = MulVal.getNode()->getOperand(0);
+ ConstantSDNode *AddVal =
+ cast<ConstantSDNode>(MulVal.getNode()->getOperand(1));
+ uint64_t Disp = AM.Disp + AddVal->getSExtValue() *
+ CN->getZExtValue();
+ if (!is64Bit ||
+ X86::isOffsetSuitableForCodeModel(Disp, M,
+ AM.hasSymbolicDisplacement()))
+ AM.Disp = Disp;
+ else
+ Reg = N.getNode()->getOperand(0);
+ } else {
+ Reg = N.getNode()->getOperand(0);
+ }
+
+ AM.IndexReg = AM.Base_Reg = Reg;
+ return false;
+ }
+ }
+ break;
+
+ case ISD::SUB: {
+ // Given A-B, if A can be completely folded into the address (leaving the
+ // index field unused), use -B as the index. This is a win if A has
+ // multiple parts that can be folded into the address. It also saves a mov
+ // if the base register has other uses, since it avoids a two-address sub
+ // instruction; however, it costs an additional mov if the index register
+ // has other uses.
+
+ // Test if the LHS of the sub can be folded.
+ X86ISelAddressMode Backup = AM;
+ if (MatchAddressRecursively(N.getNode()->getOperand(0), AM,
+ DeadNodes, Depth+1) ||
+ // If it is successful but the recursive update causes N to be deleted,
+ // then it's not safe to continue.
+ DeadNodes.IsDeleted(N.getNode())) {
+ AM = Backup;
+ break;
+ }
+ // Test if the index field is free for use.
+ if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
+ AM = Backup;
+ break;
+ }
+
+ int Cost = 0;
+ SDValue RHS = N.getNode()->getOperand(1);
+ // If the RHS involves a register with multiple uses, this
+ // transformation incurs an extra mov, due to the neg instruction
+ // clobbering its operand.
+ if (!RHS.getNode()->hasOneUse() ||
+ RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
+ RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
+ RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
+ (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
+ RHS.getNode()->getOperand(0).getValueType() == MVT::i32))
+ ++Cost;
+ // If the base is a register with multiple uses, this
+ // transformation may save a mov.
+ if ((AM.BaseType == X86ISelAddressMode::RegBase &&
+ AM.Base_Reg.getNode() &&
+ !AM.Base_Reg.getNode()->hasOneUse()) ||
+ AM.BaseType == X86ISelAddressMode::FrameIndexBase)
+ --Cost;
+ // If the folded LHS was interesting, this transformation saves
+ // address arithmetic.
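+ // (The sum below counts how many address components the folded LHS newly
+ // contributed: a symbolic displacement, a non-zero displacement, and a
+ // segment; two or more is considered interesting.)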
+ if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
+ ((AM.Disp != 0) && (Backup.Disp == 0)) +
+ (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
+ --Cost;
+ // If it doesn't look like it may be an overall win, don't do it.
+ if (Cost >= 0) {
+ AM = Backup;
+ break;
+ }
+
+ // Ok, the transformation is legal and appears profitable. Go for it.
+ SDValue Zero = CurDAG->getConstant(0, N.getValueType());
+ SDValue Neg = CurDAG->getNode(ISD::SUB, dl, N.getValueType(), Zero, RHS);
+ AM.IndexReg = Neg;
+ AM.Scale = 1;
+
+ // Insert the new nodes into the topological ordering.
+ if (Zero.getNode()->getNodeId() == -1 ||
+ Zero.getNode()->getNodeId() > N.getNode()->getNodeId()) {
+ CurDAG->RepositionNode(N.getNode(), Zero.getNode());
+ Zero.getNode()->setNodeId(N.getNode()->getNodeId());
+ }
+ if (Neg.getNode()->getNodeId() == -1 ||
+ Neg.getNode()->getNodeId() > N.getNode()->getNodeId()) {
+ CurDAG->RepositionNode(N.getNode(), Neg.getNode());
+ Neg.getNode()->setNodeId(N.getNode()->getNodeId());
+ }
+ return false;
+ }
+
+ case ISD::ADD: {
+ X86ISelAddressMode Backup = AM;
+ if (!MatchAddressRecursively(N.getNode()->getOperand(0), AM,
+ DeadNodes, Depth+1)) {
+ if (DeadNodes.IsDeleted(N.getNode()))
+ // If it is successful but the recursive update causes N to be deleted,
+ // then it's not safe to continue.
+ return true;
+ if (!MatchAddressRecursively(N.getNode()->getOperand(1), AM,
+ DeadNodes, Depth+1))
+ // If it is successful but the recursive update causes N to be deleted,
+ // then it's not safe to continue.
+ return DeadNodes.IsDeleted(N.getNode());
+ }
+
+ // Try again after commuting the operands.
+ AM = Backup;
+ if (!MatchAddressRecursively(N.getNode()->getOperand(1), AM,
+ DeadNodes, Depth+1)) {
+ if (DeadNodes.IsDeleted(N.getNode()))
+ // If it is successful but the recursive update causes N to be deleted,
+ // then it's not safe to continue.
+ return true;
+ if (!MatchAddressRecursively(N.getNode()->getOperand(0), AM,
+ DeadNodes, Depth+1))
+ // If it is successful but the recursive update causes N to be deleted,
+ // then it's not safe to continue.
+ return DeadNodes.IsDeleted(N.getNode());
+ }
+ AM = Backup;
+
+ // If we couldn't fold both operands into the address at the same time,
+ // see if we can just put each operand into a register and fold at least
+ // the add.
+ if (AM.BaseType == X86ISelAddressMode::RegBase &&
+ !AM.Base_Reg.getNode() &&
+ !AM.IndexReg.getNode()) {
+ AM.Base_Reg = N.getNode()->getOperand(0);
+ AM.IndexReg = N.getNode()->getOperand(1);
+ AM.Scale = 1;
+ return false;
+ }
+ break;
+ }
+
+ case ISD::OR:
+ // Handle "X | C" as "X + C" iff X is known to have C bits clear.
+ if (isLogicallyAddWithConstant(N, CurDAG)) {
+ X86ISelAddressMode Backup = AM;
+ ConstantSDNode *CN = cast<ConstantSDNode>(N.getOperand(1));
+ uint64_t Offset = CN->getSExtValue();
+
+ // Start with the LHS as an addr mode.
+ if (!MatchAddressRecursively(N.getOperand(0), AM, DeadNodes, Depth+1) &&
+ // Address could not have picked a GV address for the displacement.
+ AM.GV == NULL &&
+ // On x86-64, the resultant disp must fit in 32-bits.
+ (!is64Bit ||
+ X86::isOffsetSuitableForCodeModel(AM.Disp + Offset, M,
+ AM.hasSymbolicDisplacement()))) {
+ AM.Disp += Offset;
+ return false;
+ }
+ AM = Backup;
+ }
+ break;
+
+ case ISD::AND: {
+ // Perform some heroic transforms on an and of a constant-count shift
+ // with a constant to enable use of the scaled offset field.
+
+ SDValue Shift = N.getOperand(0);
+ if (Shift.getNumOperands() != 2) break;
+
+ // Scale must not be used already.
+ if (AM.IndexReg.getNode() != 0 || AM.Scale != 1) break;
+
+ SDValue X = Shift.getOperand(0);
+ ConstantSDNode *C2 = dyn_cast<ConstantSDNode>(N.getOperand(1));
+ ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
+ if (!C1 || !C2) break;
+
+ // Handle "(X >> (8-C1)) & C2" as "(X >> 8) & 0xff)" if safe. This
+ // allows us to convert the shift and and into an h-register extract and
+ // a scaled index.
+ if (Shift.getOpcode() == ISD::SRL && Shift.hasOneUse()) {
+ unsigned ScaleLog = 8 - C1->getZExtValue();
+ if (ScaleLog > 0 && ScaleLog < 4 &&
+ C2->getZExtValue() == (UINT64_C(0xff) << ScaleLog)) {
+ SDValue Eight = CurDAG->getConstant(8, MVT::i8);
+ SDValue Mask = CurDAG->getConstant(0xff, N.getValueType());
+ SDValue Srl = CurDAG->getNode(ISD::SRL, dl, N.getValueType(),
+ X, Eight);
+ SDValue And = CurDAG->getNode(ISD::AND, dl, N.getValueType(),
+ Srl, Mask);
+ SDValue ShlCount = CurDAG->getConstant(ScaleLog, MVT::i8);
+ SDValue Shl = CurDAG->getNode(ISD::SHL, dl, N.getValueType(),
+ And, ShlCount);
+
+ // Insert the new nodes into the topological ordering.
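+ // Nodes created here start with a node id of -1; reposition them so they
+ // come no later than the nodes that use them, keeping the topological
+ // ordering used by instruction selection consistent.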
+ if (Eight.getNode()->getNodeId() == -1 ||
+ Eight.getNode()->getNodeId() > X.getNode()->getNodeId()) {
+ CurDAG->RepositionNode(X.getNode(), Eight.getNode());
+ Eight.getNode()->setNodeId(X.getNode()->getNodeId());
+ }
+ if (Mask.getNode()->getNodeId() == -1 ||
+ Mask.getNode()->getNodeId() > X.getNode()->getNodeId()) {
+ CurDAG->RepositionNode(X.getNode(), Mask.getNode());
+ Mask.getNode()->setNodeId(X.getNode()->getNodeId());
+ }
+ if (Srl.getNode()->getNodeId() == -1 ||
+ Srl.getNode()->getNodeId() > Shift.getNode()->getNodeId()) {
+ CurDAG->RepositionNode(Shift.getNode(), Srl.getNode());
+ Srl.getNode()->setNodeId(Shift.getNode()->getNodeId());
+ }
+ if (And.getNode()->getNodeId() == -1 ||
+ And.getNode()->getNodeId() > N.getNode()->getNodeId()) {
+ CurDAG->RepositionNode(N.getNode(), And.getNode());
+ And.getNode()->setNodeId(N.getNode()->getNodeId());
+ }
+ if (ShlCount.getNode()->getNodeId() == -1 ||
+ ShlCount.getNode()->getNodeId() > X.getNode()->getNodeId()) {
+ CurDAG->RepositionNode(X.getNode(), ShlCount.getNode());
+ ShlCount.getNode()->setNodeId(N.getNode()->getNodeId());
+ }
+ if (Shl.getNode()->getNodeId() == -1 ||
+ Shl.getNode()->getNodeId() > N.getNode()->getNodeId()) {
+ CurDAG->RepositionNode(N.getNode(), Shl.getNode());
+ Shl.getNode()->setNodeId(N.getNode()->getNodeId());
+ }
+ CurDAG->ReplaceAllUsesWith(N, Shl, &DeadNodes);
+ AM.IndexReg = And;
+ AM.Scale = (1 << ScaleLog);
+ return false;
+ }
+ }
+
+ // Handle "(X << C1) & C2" as "(X & (C2>>C1)) << C1" if safe and if this
+ // allows us to fold the shift into this addressing mode.
+ if (Shift.getOpcode() != ISD::SHL) break;
+
+ // Not likely to be profitable if either the AND or SHIFT node has more
+ // than one use (unless all uses are for address computation). Besides, the
+ // isel mechanism requires their node ids to be reused.
+ if (!N.hasOneUse() || !Shift.hasOneUse())
+ break;
+
+ // Verify that the shift amount is something we can fold.
+ unsigned ShiftCst = C1->getZExtValue();
+ if (ShiftCst != 1 && ShiftCst != 2 && ShiftCst != 3)
+ break;
+
+ // Get the new AND mask, this folds to a constant.
+ SDValue NewANDMask = CurDAG->getNode(ISD::SRL, dl, N.getValueType(),
+ SDValue(C2, 0), SDValue(C1, 0));
+ SDValue NewAND = CurDAG->getNode(ISD::AND, dl, N.getValueType(), X,
+ NewANDMask);
+ SDValue NewSHIFT = CurDAG->getNode(ISD::SHL, dl, N.getValueType(),
+ NewAND, SDValue(C1, 0));
+
+ // Insert the new nodes into the topological ordering.
+ if (C1->getNodeId() > X.getNode()->getNodeId()) {
+ CurDAG->RepositionNode(X.getNode(), C1);
+ C1->setNodeId(X.getNode()->getNodeId());
+ }
+ if (NewANDMask.getNode()->getNodeId() == -1 ||
+ NewANDMask.getNode()->getNodeId() > X.getNode()->getNodeId()) {
+ CurDAG->RepositionNode(X.getNode(), NewANDMask.getNode());
+ NewANDMask.getNode()->setNodeId(X.getNode()->getNodeId());
+ }
+ if (NewAND.getNode()->getNodeId() == -1 ||
+ NewAND.getNode()->getNodeId() > Shift.getNode()->getNodeId()) {
+ CurDAG->RepositionNode(Shift.getNode(), NewAND.getNode());
+ NewAND.getNode()->setNodeId(Shift.getNode()->getNodeId());
+ }
+ if (NewSHIFT.getNode()->getNodeId() == -1 ||
+ NewSHIFT.getNode()->getNodeId() > N.getNode()->getNodeId()) {
+ CurDAG->RepositionNode(N.getNode(), NewSHIFT.getNode());
+ NewSHIFT.getNode()->setNodeId(N.getNode()->getNodeId());
+ }
+
+ CurDAG->ReplaceAllUsesWith(N, NewSHIFT, &DeadNodes);
+
+ AM.Scale = 1 << ShiftCst;
+ AM.IndexReg = NewAND;
+ return false;
+ }
+ }
+
+ return MatchAddressBase(N, AM);
+}
+
+/// MatchAddressBase - Helper for MatchAddress. Add the specified node to the
+/// specified addressing mode without any further recursion.
+bool X86DAGToDAGISel::MatchAddressBase(SDValue N, X86ISelAddressMode &AM) {
+ // Is the base register already occupied?
+ if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
+ // If so, check to see if the scale index register is set.
+ if (AM.IndexReg.getNode() == 0) {
+ AM.IndexReg = N;
+ AM.Scale = 1;
+ return false;
+ }
+
+ // Otherwise, we cannot select it.
+ return true;
+ }
+
+ // Default, generate it as a register.
+ AM.BaseType = X86ISelAddressMode::RegBase;
+ AM.Base_Reg = N;
+ return false;
+}
+
+/// SelectAddr - returns true if it is able to pattern match an addressing mode.
+/// It returns the operands which make up the maximal addressing mode it can
+/// match by reference.
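+/// Note that this uses the opposite return convention from MatchAddress,
+/// which returns true when it could not match.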
+bool X86DAGToDAGISel::SelectAddr(SDNode *Op, SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index,
+ SDValue &Disp, SDValue &Segment) {
+ X86ISelAddressMode AM;
+ if (MatchAddress(N, AM))
+ return false;
+
+ EVT VT = N.getValueType();
+ if (AM.BaseType == X86ISelAddressMode::RegBase) {
+ if (!AM.Base_Reg.getNode())
+ AM.Base_Reg = CurDAG->getRegister(0, VT);
+ }
+
+ if (!AM.IndexReg.getNode())
+ AM.IndexReg = CurDAG->getRegister(0, VT);
+
+ getAddressOperands(AM, Base, Scale, Index, Disp, Segment);
+ return true;
+}
+
+/// SelectScalarSSELoad - Match a scalar SSE load. In particular, we want to
+/// match a load whose top elements are either undef or zeros. The load flavor
+/// is derived from the type of N, which is either v4f32 or v2f64.
+///
+/// We also return:
+/// PatternNodeWithChain: this is the matched node that has a chain input and
+/// output.
+bool X86DAGToDAGISel::SelectScalarSSELoad(SDNode *Root,
+ SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index,
+ SDValue &Disp, SDValue &Segment,
+ SDValue &PatternNodeWithChain) {
+ if (N.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+ PatternNodeWithChain = N.getOperand(0);
+ if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
+ PatternNodeWithChain.hasOneUse() &&
+ IsProfitableToFold(N.getOperand(0), N.getNode(), Root) &&
+ IsLegalToFold(N.getOperand(0), N.getNode(), Root, OptLevel)) {
+ LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
+ if (!SelectAddr(Root, LD->getBasePtr(), Base, Scale, Index, Disp,Segment))
+ return false;
+ return true;
+ }
+ }
+
+ // Also handle the case where we explicitly require zeros in the top
+ // elements. This is a vector shuffle from the zero vector.
+ if (N.getOpcode() == X86ISD::VZEXT_MOVL && N.getNode()->hasOneUse() &&
+ // Check to see if the top elements are all zeros (or bitcast of zeros).
+ N.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ N.getOperand(0).getNode()->hasOneUse() &&
+ ISD::isNON_EXTLoad(N.getOperand(0).getOperand(0).getNode()) &&
+ N.getOperand(0).getOperand(0).hasOneUse() &&
+ IsProfitableToFold(N.getOperand(0), N.getNode(), Root) &&
+ IsLegalToFold(N.getOperand(0), N.getNode(), Root, OptLevel)) {
+ // Okay, this is a zero extending load. Fold it.
+ LoadSDNode *LD = cast<LoadSDNode>(N.getOperand(0).getOperand(0));
+ if (!SelectAddr(Root, LD->getBasePtr(), Base, Scale, Index, Disp, Segment))
+ return false;
+ PatternNodeWithChain = SDValue(LD, 0);
+ return true;
+ }
+ return false;
+}
+
+
+/// SelectLEAAddr - it calls SelectAddr and determines if the maximal addressing
+/// mode it matches can be cost-effectively emitted as an LEA instruction.
+bool X86DAGToDAGISel::SelectLEAAddr(SDNode *Op, SDValue N,
+ SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp) {
+ X86ISelAddressMode AM;
+
+ // Pre-fill AM.Segment with a zero register so that MatchAddress will not
+ // pick up a segment, since LEA doesn't support segments.
+ SDValue Copy = AM.Segment;
+ SDValue T = CurDAG->getRegister(0, MVT::i32);
+ AM.Segment = T;
+ if (MatchAddress(N, AM))
+ return false;
+ assert (T == AM.Segment);
+ AM.Segment = Copy;
+
+ EVT VT = N.getValueType();
+ unsigned Complexity = 0;
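+ // Complexity is a rough count of how many address components the LEA would
+ // combine; only emit an LEA when it folds enough work (see the threshold
+ // check below).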
+ if (AM.BaseType == X86ISelAddressMode::RegBase)
+ if (AM.Base_Reg.getNode())
+ Complexity = 1;
+ else
+ AM.Base_Reg = CurDAG->getRegister(0, VT);
+ else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
+ Complexity = 4;
+
+ if (AM.IndexReg.getNode())
+ Complexity++;
+ else
+ AM.IndexReg = CurDAG->getRegister(0, VT);
+
+ // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg or to
+ // use a simple shift.
+ if (AM.Scale > 1)
+ Complexity++;
+
+ // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
+ // into a LEA. This was determined with some experimentation but is by no
+ // means optimal (especially for code size considerations). LEA is nice
+ // because of its three-address nature. Tweak the cost function again when
+ // we can run convertToThreeAddress() at register allocation time.
+ if (AM.hasSymbolicDisplacement()) {
+ // For X86-64, we should always use lea to materialize RIP relative
+ // addresses.
+ if (Subtarget->is64Bit())
+ Complexity = 4;
+ else
+ Complexity += 2;
+ }
+
+ if (AM.Disp && (AM.Base_Reg.getNode() || AM.IndexReg.getNode()))
+ Complexity++;
+
+ // If it isn't worth using an LEA, reject it.
+ if (Complexity <= 2)
+ return false;
+
+ SDValue Segment;
+ getAddressOperands(AM, Base, Scale, Index, Disp, Segment);
+ return true;
+}
+
+/// SelectTLSADDRAddr - This is only run on TargetGlobalTLSAddress nodes.
+bool X86DAGToDAGISel::SelectTLSADDRAddr(SDNode *Op, SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index,
+ SDValue &Disp) {
+ assert(N.getOpcode() == ISD::TargetGlobalTLSAddress);
+ const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N);
+
+ X86ISelAddressMode AM;
+ AM.GV = GA->getGlobal();
+ AM.Disp += GA->getOffset();
+ AM.Base_Reg = CurDAG->getRegister(0, N.getValueType());
+ AM.SymbolFlags = GA->getTargetFlags();
+
+ if (N.getValueType() == MVT::i32) {
+ AM.Scale = 1;
+ AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
+ } else {
+ AM.IndexReg = CurDAG->getRegister(0, MVT::i64);
+ }
+
+ SDValue Segment;
+ getAddressOperands(AM, Base, Scale, Index, Disp, Segment);
+ return true;
+}
+
+
+bool X86DAGToDAGISel::TryFoldLoad(SDNode *P, SDValue N,
+ SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp,
+ SDValue &Segment) {
+ if (!ISD::isNON_EXTLoad(N.getNode()) ||
+ !IsProfitableToFold(N, P, P) ||
+ !IsLegalToFold(N, P, P, OptLevel))
+ return false;
+
+ return SelectAddr(P, N.getOperand(1), Base, Scale, Index, Disp, Segment);
+}
+
+/// getGlobalBaseReg - Return an SDNode that returns the value of
+/// the global base register. Output instructions required to
+/// initialize the global base register, if necessary.
+///
+SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
+ unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
+ return CurDAG->getRegister(GlobalBaseReg, TLI.getPointerTy()).getNode();
+}
+
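+/// FindCallStartFromCall - Walk up the token chain from a call-related node
+/// until the CALLSEQ_START node that opened the call sequence is found.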
+static SDNode *FindCallStartFromCall(SDNode *Node) {
+ if (Node->getOpcode() == ISD::CALLSEQ_START) return Node;
+ assert(Node->getOperand(0).getValueType() == MVT::Other &&
+ "Node doesn't have a token chain argument!");
+ return FindCallStartFromCall(Node->getOperand(0).getNode());
+}
+
+SDNode *X86DAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) {
+ SDValue Chain = Node->getOperand(0);
+ SDValue In1 = Node->getOperand(1);
+ SDValue In2L = Node->getOperand(2);
+ SDValue In2H = Node->getOperand(3);
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+ if (!SelectAddr(In1.getNode(), In1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4))
+ return NULL;
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = cast<MemSDNode>(Node)->getMemOperand();
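+ // Tmp0..Tmp4 form the 5-operand memory reference; In2L and In2H carry the
+ // low and high halves of the 64-bit operand.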
+ const SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, In2L, In2H, Chain};
+ SDNode *ResNode = CurDAG->getMachineNode(Opc, Node->getDebugLoc(),
+ MVT::i32, MVT::i32, MVT::Other, Ops,
+ array_lengthof(Ops));
+ cast<MachineSDNode>(ResNode)->setMemRefs(MemOp, MemOp + 1);
+ return ResNode;
+}
+
+SDNode *X86DAGToDAGISel::SelectAtomicLoadAdd(SDNode *Node, EVT NVT) {
+ if (Node->hasAnyUseOfValue(0))
+ return 0;
+
+ // Optimize common patterns for __sync_add_and_fetch and
+ // __sync_sub_and_fetch where the result is not used. This allows us
+ // to use the "lock" versions of the add, sub, inc, and dec instructions.
+ // FIXME: Do not use special instructions; instead add the "lock"
+ // prefix to the target node somehow. The extra information would then be
+ // transferred to the machine instruction, where it denotes the prefix.
+ SDValue Chain = Node->getOperand(0);
+ SDValue Ptr = Node->getOperand(1);
+ SDValue Val = Node->getOperand(2);
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+ if (!SelectAddr(Ptr.getNode(), Ptr, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4))
+ return 0;
+
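+ // Classify the operand: constants of +1/-1 select lock inc/dec, other
+ // constants become lock add/sub with an immediate (negative values are
+ // negated and use sub), and a (0 - x) operand becomes a lock sub of x.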
+ bool isInc = false, isDec = false, isSub = false, isCN = false;
+ ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val);
+ if (CN) {
+ isCN = true;
+ int64_t CNVal = CN->getSExtValue();
+ if (CNVal == 1)
+ isInc = true;
+ else if (CNVal == -1)
+ isDec = true;
+ else if (CNVal >= 0)
+ Val = CurDAG->getTargetConstant(CNVal, NVT);
+ else {
+ isSub = true;
+ Val = CurDAG->getTargetConstant(-CNVal, NVT);
+ }
+ } else if (Val.hasOneUse() &&
+ Val.getOpcode() == ISD::SUB &&
+ X86::isZeroNode(Val.getOperand(0))) {
+ isSub = true;
+ Val = Val.getOperand(1);
+ }
+
+ unsigned Opc = 0;
+ switch (NVT.getSimpleVT().SimpleTy) {
+ default: return 0;
+ case MVT::i8:
+ if (isInc)
+ Opc = X86::LOCK_INC8m;
+ else if (isDec)
+ Opc = X86::LOCK_DEC8m;
+ else if (isSub) {
+ if (isCN)
+ Opc = X86::LOCK_SUB8mi;
+ else
+ Opc = X86::LOCK_SUB8mr;
+ } else {
+ if (isCN)
+ Opc = X86::LOCK_ADD8mi;
+ else
+ Opc = X86::LOCK_ADD8mr;
+ }
+ break;
+ case MVT::i16:
+ if (isInc)
+ Opc = X86::LOCK_INC16m;
+ else if (isDec)
+ Opc = X86::LOCK_DEC16m;
+ else if (isSub) {
+ if (isCN) {
+ if (Predicate_immSext8(Val.getNode()))
+ Opc = X86::LOCK_SUB16mi8;
+ else
+ Opc = X86::LOCK_SUB16mi;
+ } else
+ Opc = X86::LOCK_SUB16mr;
+ } else {
+ if (isCN) {
+ if (Predicate_immSext8(Val.getNode()))
+ Opc = X86::LOCK_ADD16mi8;
+ else
+ Opc = X86::LOCK_ADD16mi;
+ } else
+ Opc = X86::LOCK_ADD16mr;
+ }
+ break;
+ case MVT::i32:
+ if (isInc)
+ Opc = X86::LOCK_INC32m;
+ else if (isDec)
+ Opc = X86::LOCK_DEC32m;
+ else if (isSub) {
+ if (isCN) {
+ if (Predicate_immSext8(Val.getNode()))
+ Opc = X86::LOCK_SUB32mi8;
+ else
+ Opc = X86::LOCK_SUB32mi;
+ } else
+ Opc = X86::LOCK_SUB32mr;
+ } else {
+ if (isCN) {
+ if (Predicate_immSext8(Val.getNode()))
+ Opc = X86::LOCK_ADD32mi8;
+ else
+ Opc = X86::LOCK_ADD32mi;
+ } else
+ Opc = X86::LOCK_ADD32mr;
+ }
+ break;
+ case MVT::i64:
+ if (isInc)
+ Opc = X86::LOCK_INC64m;
+ else if (isDec)
+ Opc = X86::LOCK_DEC64m;
+ else if (isSub) {
+ Opc = X86::LOCK_SUB64mr;
+ if (isCN) {
+ if (Predicate_immSext8(Val.getNode()))
+ Opc = X86::LOCK_SUB64mi8;
+ else if (Predicate_i64immSExt32(Val.getNode()))
+ Opc = X86::LOCK_SUB64mi32;
+ }
+ } else {
+ Opc = X86::LOCK_ADD64mr;
+ if (isCN) {
+ if (Predicate_immSext8(Val.getNode()))
+ Opc = X86::LOCK_ADD64mi8;
+ else if (Predicate_i64immSExt32(Val.getNode()))
+ Opc = X86::LOCK_ADD64mi32;
+ }
+ }
+ break;
+ }
+
+ DebugLoc dl = Node->getDebugLoc();
+ SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
+ dl, NVT), 0);
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = cast<MemSDNode>(Node)->getMemOperand();
+ if (isInc || isDec) {
+ SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain };
+ SDValue Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops, 6), 0);
+ cast<MachineSDNode>(Ret)->setMemRefs(MemOp, MemOp + 1);
+ SDValue RetVals[] = { Undef, Ret };
+ return CurDAG->getMergeValues(RetVals, 2, dl).getNode();
+ } else {
+ SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Val, Chain };
+ SDValue Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops, 7), 0);
+ cast<MachineSDNode>(Ret)->setMemRefs(MemOp, MemOp + 1);
+ SDValue RetVals[] = { Undef, Ret };
+ return CurDAG->getMergeValues(RetVals, 2, dl).getNode();
+ }
+}
+
+/// HasNoSignedComparisonUses - Test whether the given X86ISD::CMP node has
+/// any uses which require the SF or OF bits to be accurate.
+static bool HasNoSignedComparisonUses(SDNode *N) {
+ // Examine each user of the node.
+ for (SDNode::use_iterator UI = N->use_begin(),
+ UE = N->use_end(); UI != UE; ++UI) {
+ // Only examine CopyToReg uses.
+ if (UI->getOpcode() != ISD::CopyToReg)
+ return false;
+ // Only examine CopyToReg uses that copy to EFLAGS.
+ if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() !=
+ X86::EFLAGS)
+ return false;
+ // Examine each user of the CopyToReg use.
+ for (SDNode::use_iterator FlagUI = UI->use_begin(),
+ FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
+ // Only examine the Flag result.
+ if (FlagUI.getUse().getResNo() != 1) continue;
+ // Anything unusual: assume conservatively.
+ if (!FlagUI->isMachineOpcode()) return false;
+ // Examine the opcode of the user.
+ switch (FlagUI->getMachineOpcode()) {
+ // These comparisons don't treat the most significant bit specially.
+ case X86::SETAr: case X86::SETAEr: case X86::SETBr: case X86::SETBEr:
+ case X86::SETEr: case X86::SETNEr: case X86::SETPr: case X86::SETNPr:
+ case X86::SETAm: case X86::SETAEm: case X86::SETBm: case X86::SETBEm:
+ case X86::SETEm: case X86::SETNEm: case X86::SETPm: case X86::SETNPm:
+ case X86::JA_4: case X86::JAE_4: case X86::JB_4: case X86::JBE_4:
+ case X86::JE_4: case X86::JNE_4: case X86::JP_4: case X86::JNP_4:
+ case X86::CMOVA16rr: case X86::CMOVA16rm:
+ case X86::CMOVA32rr: case X86::CMOVA32rm:
+ case X86::CMOVA64rr: case X86::CMOVA64rm:
+ case X86::CMOVAE16rr: case X86::CMOVAE16rm:
+ case X86::CMOVAE32rr: case X86::CMOVAE32rm:
+ case X86::CMOVAE64rr: case X86::CMOVAE64rm:
+ case X86::CMOVB16rr: case X86::CMOVB16rm:
+ case X86::CMOVB32rr: case X86::CMOVB32rm:
+ case X86::CMOVB64rr: case X86::CMOVB64rm:
+ case X86::CMOVBE16rr: case X86::CMOVBE16rm:
+ case X86::CMOVBE32rr: case X86::CMOVBE32rm:
+ case X86::CMOVBE64rr: case X86::CMOVBE64rm:
+ case X86::CMOVE16rr: case X86::CMOVE16rm:
+ case X86::CMOVE32rr: case X86::CMOVE32rm:
+ case X86::CMOVE64rr: case X86::CMOVE64rm:
+ case X86::CMOVNE16rr: case X86::CMOVNE16rm:
+ case X86::CMOVNE32rr: case X86::CMOVNE32rm:
+ case X86::CMOVNE64rr: case X86::CMOVNE64rm:
+ case X86::CMOVNP16rr: case X86::CMOVNP16rm:
+ case X86::CMOVNP32rr: case X86::CMOVNP32rm:
+ case X86::CMOVNP64rr: case X86::CMOVNP64rm:
+ case X86::CMOVP16rr: case X86::CMOVP16rm:
+ case X86::CMOVP32rr: case X86::CMOVP32rm:
+ case X86::CMOVP64rr: case X86::CMOVP64rm:
+ continue;
+ // Anything else: assume conservatively.
+ default: return false;
+ }
+ }
+ }
+ return true;
+}
+
+SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
+ EVT NVT = Node->getValueType(0);
+ unsigned Opc, MOpc;
+ unsigned Opcode = Node->getOpcode();
+ DebugLoc dl = Node->getDebugLoc();
+
+ DEBUG(dbgs() << "Selecting: "; Node->dump(CurDAG); dbgs() << '\n');
+
+ if (Node->isMachineOpcode()) {
+ DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
+ return NULL; // Already selected.
+ }
+
+ switch (Opcode) {
+ default: break;
+ case X86ISD::GlobalBaseReg:
+ return getGlobalBaseReg();
+
+ case X86ISD::ATOMOR64_DAG:
+ return SelectAtomic64(Node, X86::ATOMOR6432);
+ case X86ISD::ATOMXOR64_DAG:
+ return SelectAtomic64(Node, X86::ATOMXOR6432);
+ case X86ISD::ATOMADD64_DAG:
+ return SelectAtomic64(Node, X86::ATOMADD6432);
+ case X86ISD::ATOMSUB64_DAG:
+ return SelectAtomic64(Node, X86::ATOMSUB6432);
+ case X86ISD::ATOMNAND64_DAG:
+ return SelectAtomic64(Node, X86::ATOMNAND6432);
+ case X86ISD::ATOMAND64_DAG:
+ return SelectAtomic64(Node, X86::ATOMAND6432);
+ case X86ISD::ATOMSWAP64_DAG:
+ return SelectAtomic64(Node, X86::ATOMSWAP6432);
+
+ case ISD::ATOMIC_LOAD_ADD: {
+ SDNode *RetVal = SelectAtomicLoadAdd(Node, NVT);
+ if (RetVal)
+ return RetVal;
+ break;
+ }
+
+ case ISD::SMUL_LOHI:
+ case ISD::UMUL_LOHI: {
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+
+ bool isSigned = Opcode == ISD::SMUL_LOHI;
+ if (!isSigned) {
+ switch (NVT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Unsupported VT!");
+ case MVT::i8: Opc = X86::MUL8r; MOpc = X86::MUL8m; break;
+ case MVT::i16: Opc = X86::MUL16r; MOpc = X86::MUL16m; break;
+ case MVT::i32: Opc = X86::MUL32r; MOpc = X86::MUL32m; break;
+ case MVT::i64: Opc = X86::MUL64r; MOpc = X86::MUL64m; break;
+ }
+ } else {
+ switch (NVT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Unsupported VT!");
+ case MVT::i8: Opc = X86::IMUL8r; MOpc = X86::IMUL8m; break;
+ case MVT::i16: Opc = X86::IMUL16r; MOpc = X86::IMUL16m; break;
+ case MVT::i32: Opc = X86::IMUL32r; MOpc = X86::IMUL32m; break;
+ case MVT::i64: Opc = X86::IMUL64r; MOpc = X86::IMUL64m; break;
+ }
+ }
+
+ unsigned LoReg, HiReg;
+ switch (NVT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Unsupported VT!");
+ case MVT::i8: LoReg = X86::AL; HiReg = X86::AH; break;
+ case MVT::i16: LoReg = X86::AX; HiReg = X86::DX; break;
+ case MVT::i32: LoReg = X86::EAX; HiReg = X86::EDX; break;
+ case MVT::i64: LoReg = X86::RAX; HiReg = X86::RDX; break;
+ }
+
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+ bool foldedLoad = TryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
+ // Multiply is commutative.
+ if (!foldedLoad) {
+ foldedLoad = TryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
+ if (foldedLoad)
+ std::swap(N0, N1);
+ }
+
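+ // MUL/IMUL implicitly read the low register, so copy the first operand
+ // into LoReg; the result is produced in HiReg:LoReg.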
+ SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
+ N0, SDValue()).getValue(1);
+
+ if (foldedLoad) {
+ SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
+ InFlag };
+ SDNode *CNode =
+ CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Flag, Ops,
+ array_lengthof(Ops));
+ InFlag = SDValue(CNode, 1);
+ // Update the chain.
+ ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
+ } else {
+ InFlag =
+ SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Flag, N1, InFlag), 0);
+ }
+
+ // Copy the low half of the result, if it is needed.
+ if (!SDValue(Node, 0).use_empty()) {
+ SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+ LoReg, NVT, InFlag);
+ InFlag = Result.getValue(2);
+ ReplaceUses(SDValue(Node, 0), Result);
+ DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
+ }
+ // Copy the high half of the result, if it is needed.
+ if (!SDValue(Node, 1).use_empty()) {
+ SDValue Result;
+ if (HiReg == X86::AH && Subtarget->is64Bit()) {
+ // Prevent use of AH in a REX instruction by referencing AX instead.
+ // Shift it down 8 bits.
+ Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+ X86::AX, MVT::i16, InFlag);
+ InFlag = Result.getValue(2);
+ Result = SDValue(CurDAG->getMachineNode(X86::SHR16ri, dl, MVT::i16,
+ Result,
+ CurDAG->getTargetConstant(8, MVT::i8)), 0);
+ // Then truncate it down to i8.
+ Result = CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl,
+ MVT::i8, Result);
+ } else {
+ Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+ HiReg, NVT, InFlag);
+ InFlag = Result.getValue(2);
+ }
+ ReplaceUses(SDValue(Node, 1), Result);
+ DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
+ }
+
+ return NULL;
+ }
+
+ case ISD::SDIVREM:
+ case ISD::UDIVREM: {
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+
+ bool isSigned = Opcode == ISD::SDIVREM;
+ if (!isSigned) {
+ switch (NVT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Unsupported VT!");
+ case MVT::i8: Opc = X86::DIV8r; MOpc = X86::DIV8m; break;
+ case MVT::i16: Opc = X86::DIV16r; MOpc = X86::DIV16m; break;
+ case MVT::i32: Opc = X86::DIV32r; MOpc = X86::DIV32m; break;
+ case MVT::i64: Opc = X86::DIV64r; MOpc = X86::DIV64m; break;
+ }
+ } else {
+ switch (NVT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Unsupported VT!");
+ case MVT::i8: Opc = X86::IDIV8r; MOpc = X86::IDIV8m; break;
+ case MVT::i16: Opc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
+ case MVT::i32: Opc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
+ case MVT::i64: Opc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
+ }
+ }
+
+ unsigned LoReg, HiReg, ClrReg;
+ unsigned ClrOpcode, SExtOpcode;
+ switch (NVT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Unsupported VT!");
+ case MVT::i8:
+ LoReg = X86::AL; ClrReg = HiReg = X86::AH;
+ ClrOpcode = 0;
+ SExtOpcode = X86::CBW;
+ break;
+ case MVT::i16:
+ LoReg = X86::AX; HiReg = X86::DX;
+ ClrOpcode = X86::MOV16r0; ClrReg = X86::DX;
+ SExtOpcode = X86::CWD;
+ break;
+ case MVT::i32:
+ LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
+ ClrOpcode = X86::MOV32r0;
+ SExtOpcode = X86::CDQ;
+ break;
+ case MVT::i64:
+ LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
+ ClrOpcode = X86::MOV64r0;
+ SExtOpcode = X86::CQO;
+ break;
+ }
+
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+ bool foldedLoad = TryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
+ bool signBitIsZero = CurDAG->SignBitIsZero(N0);
+
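+ // DIV/IDIV implicitly divide HiReg:LoReg by the operand, leaving the
+ // quotient in LoReg and the remainder in HiReg, so the dividend is set up
+ // in those registers first.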
+ SDValue InFlag;
+ if (NVT == MVT::i8 && (!isSigned || signBitIsZero)) {
+ // Special case for div8, just use a move with zero extension to AX to
+ // clear the upper 8 bits (AH).
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Move, Chain;
+ if (TryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
+ SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
+ Move =
+ SDValue(CurDAG->getMachineNode(X86::MOVZX16rm8, dl, MVT::i16,
+ MVT::Other, Ops,
+ array_lengthof(Ops)), 0);
+ Chain = Move.getValue(1);
+ ReplaceUses(N0.getValue(1), Chain);
+ } else {
+ Move =
+ SDValue(CurDAG->getMachineNode(X86::MOVZX16rr8, dl, MVT::i16, N0),0);
+ Chain = CurDAG->getEntryNode();
+ }
+ Chain = CurDAG->getCopyToReg(Chain, dl, X86::AX, Move, SDValue());
+ InFlag = Chain.getValue(1);
+ } else {
+ InFlag =
+ CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
+ LoReg, N0, SDValue()).getValue(1);
+ if (isSigned && !signBitIsZero) {
+ // Sign extend the low part into the high part.
+ InFlag =
+ SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Flag, InFlag),0);
+ } else {
+ // Zero out the high part, effectively zero extending the input.
+ SDValue ClrNode =
+ SDValue(CurDAG->getMachineNode(ClrOpcode, dl, NVT), 0);
+ InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
+ ClrNode, InFlag).getValue(1);
+ }
+ }
+
+ if (foldedLoad) {
+ SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
+ InFlag };
+ SDNode *CNode =
+ CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Flag, Ops,
+ array_lengthof(Ops));
+ InFlag = SDValue(CNode, 1);
+ // Update the chain.
+ ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
+ } else {
+ InFlag =
+ SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Flag, N1, InFlag), 0);
+ }
+
+ // Copy the division (low) result, if it is needed.
+ if (!SDValue(Node, 0).use_empty()) {
+ SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+ LoReg, NVT, InFlag);
+ InFlag = Result.getValue(2);
+ ReplaceUses(SDValue(Node, 0), Result);
+ DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
+ }
+ // Copy the remainder (high) result, if it is needed.
+ if (!SDValue(Node, 1).use_empty()) {
+ SDValue Result;
+ if (HiReg == X86::AH && Subtarget->is64Bit()) {
+ // Prevent use of AH in a REX instruction by referencing AX instead.
+ // Shift it down 8 bits.
+ Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+ X86::AX, MVT::i16, InFlag);
+ InFlag = Result.getValue(2);
+ Result = SDValue(CurDAG->getMachineNode(X86::SHR16ri, dl, MVT::i16,
+ Result,
+ CurDAG->getTargetConstant(8, MVT::i8)),
+ 0);
+ // Then truncate it down to i8.
+ Result = CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl,
+ MVT::i8, Result);
+ } else {
+ Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+ HiReg, NVT, InFlag);
+ InFlag = Result.getValue(2);
+ }
+ ReplaceUses(SDValue(Node, 1), Result);
+ DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
+ }
+ return NULL;
+ }
+
+ case X86ISD::CMP: {
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+
+ // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
+ // use a smaller encoding.
+ if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
+ // Look past the truncate if CMP is the only use of it.
+ N0 = N0.getOperand(0);
+ if (N0.getNode()->getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
+ N0.getValueType() != MVT::i8 &&
+ X86::isZeroNode(N1)) {
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getNode()->getOperand(1));
+ if (!C) break;
+
+ // For example, convert "testl %eax, $8" to "testb %al, $8"
+ if ((C->getZExtValue() & ~UINT64_C(0xff)) == 0 &&
+ (!(C->getZExtValue() & 0x80) ||
+ HasNoSignedComparisonUses(Node))) {
+ SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i8);
+ SDValue Reg = N0.getNode()->getOperand(0);
+
+ // On x86-32, only the ABCD registers have 8-bit subregisters.
+ if (!Subtarget->is64Bit()) {
+ TargetRegisterClass *TRC = 0;
+ switch (N0.getValueType().getSimpleVT().SimpleTy) {
+ case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break;
+ case MVT::i16: TRC = &X86::GR16_ABCDRegClass; break;
+ default: llvm_unreachable("Unsupported TEST operand type!");
+ }
+ SDValue RC = CurDAG->getTargetConstant(TRC->getID(), MVT::i32);
+ Reg = SDValue(CurDAG->getMachineNode(X86::COPY_TO_REGCLASS, dl,
+ Reg.getValueType(), Reg, RC), 0);
+ }
+
+ // Extract the l-register.
+ SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl,
+ MVT::i8, Reg);
+
+ // Emit a testb.
+ return CurDAG->getMachineNode(X86::TEST8ri, dl, MVT::i32, Subreg, Imm);
+ }
+
+ // For example, "testl %eax, $2048" to "testb %ah, $8".
+ if ((C->getZExtValue() & ~UINT64_C(0xff00)) == 0 &&
+ (!(C->getZExtValue() & 0x8000) ||
+ HasNoSignedComparisonUses(Node))) {
+ // Shift the immediate right by 8 bits.
+ SDValue ShiftedImm = CurDAG->getTargetConstant(C->getZExtValue() >> 8,
+ MVT::i8);
+ SDValue Reg = N0.getNode()->getOperand(0);
+
+ // Put the value in an ABCD register.
+ TargetRegisterClass *TRC = 0;
+ switch (N0.getValueType().getSimpleVT().SimpleTy) {
+ case MVT::i64: TRC = &X86::GR64_ABCDRegClass; break;
+ case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break;
+ case MVT::i16: TRC = &X86::GR16_ABCDRegClass; break;
+ default: llvm_unreachable("Unsupported TEST operand type!");
+ }
+ SDValue RC = CurDAG->getTargetConstant(TRC->getID(), MVT::i32);
+ Reg = SDValue(CurDAG->getMachineNode(X86::COPY_TO_REGCLASS, dl,
+ Reg.getValueType(), Reg, RC), 0);
+
+ // Extract the h-register.
+ SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl,
+ MVT::i8, Reg);
+
+ // Emit a testb. No special NOREX tricks are needed since there's
+ // only one GPR operand!
+ return CurDAG->getMachineNode(X86::TEST8ri, dl, MVT::i32,
+ Subreg, ShiftedImm);
+ }
+
+ // For example, "testl %eax, $32776" to "testw %ax, $32776".
+ if ((C->getZExtValue() & ~UINT64_C(0xffff)) == 0 &&
+ N0.getValueType() != MVT::i16 &&
+ (!(C->getZExtValue() & 0x8000) ||
+ HasNoSignedComparisonUses(Node))) {
+ SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i16);
+ SDValue Reg = N0.getNode()->getOperand(0);
+
+ // Extract the 16-bit subregister.
+ SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_16bit, dl,
+ MVT::i16, Reg);
+
+ // Emit a testw.
+ return CurDAG->getMachineNode(X86::TEST16ri, dl, MVT::i32, Subreg, Imm);
+ }
+
+ // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
+ if ((C->getZExtValue() & ~UINT64_C(0xffffffff)) == 0 &&
+ N0.getValueType() == MVT::i64 &&
+ (!(C->getZExtValue() & 0x80000000) ||
+ HasNoSignedComparisonUses(Node))) {
+ SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i32);
+ SDValue Reg = N0.getNode()->getOperand(0);
+
+ // Extract the 32-bit subregister.
+ SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_32bit, dl,
+ MVT::i32, Reg);
+
+ // Emit a testl.
+ return CurDAG->getMachineNode(X86::TEST32ri, dl, MVT::i32, Subreg, Imm);
+ }
+ }
+ break;
+ }
+ }
+
+ SDNode *ResNode = SelectCode(Node);
+
+ DEBUG(dbgs() << "=> ";
+ if (ResNode == NULL || ResNode == Node)
+ Node->dump(CurDAG);
+ else
+ ResNode->dump(CurDAG);
+ dbgs() << '\n');
+
+ return ResNode;
+}
+
+bool X86DAGToDAGISel::
+SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
+ std::vector<SDValue> &OutOps) {
+ SDValue Op0, Op1, Op2, Op3, Op4;
+ switch (ConstraintCode) {
+ case 'o': // offsetable ??
+ case 'v': // not offsetable ??
+ default: return true;
+ case 'm': // memory
+ if (!SelectAddr(Op.getNode(), Op, Op0, Op1, Op2, Op3, Op4))
+ return true;
+ break;
+ }
+
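+ // The five operands form the standard X86 memory reference:
+ // base, scale, index, displacement, and segment.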
+ OutOps.push_back(Op0);
+ OutOps.push_back(Op1);
+ OutOps.push_back(Op2);
+ OutOps.push_back(Op3);
+ OutOps.push_back(Op4);
+ return false;
+}
+
+/// createX86ISelDag - This pass converts a legalized DAG into a
+/// X86-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
+ llvm::CodeGenOpt::Level OptLevel) {
+ return new X86DAGToDAGISel(TM, OptLevel);
+}
diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
new file mode 100644
index 0000000..b02c33d
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -0,0 +1,10448 @@
+//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that X86 uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-isel"
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86ISelLowering.h"
+#include "X86TargetMachine.h"
+#include "X86TargetObjectFile.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/GlobalAlias.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/VectorExtras.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+using namespace dwarf;
+
+STATISTIC(NumTailCalls, "Number of tail calls");
+
+static cl::opt<bool>
+DisableMMX("disable-mmx", cl::Hidden, cl::desc("Disable use of MMX"));
+
+// Forward declarations.
+static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
+ SDValue V2);
+
+static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
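+ // Choose the object-file lowering that matches the subtarget's binary
+ // format: Mach-O for Darwin, ELF for ELF targets, and COFF for
+ // MinGW/Cygwin/Windows.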
+ switch (TM.getSubtarget<X86Subtarget>().TargetType) {
+ default: llvm_unreachable("unknown subtarget type");
+ case X86Subtarget::isDarwin:
+ if (TM.getSubtarget<X86Subtarget>().is64Bit())
+ return new X8664_MachoTargetObjectFile();
+ return new TargetLoweringObjectFileMachO();
+ case X86Subtarget::isELF:
+ if (TM.getSubtarget<X86Subtarget>().is64Bit())
+ return new X8664_ELFTargetObjectFile(TM);
+ return new X8632_ELFTargetObjectFile(TM);
+ case X86Subtarget::isMingw:
+ case X86Subtarget::isCygwin:
+ case X86Subtarget::isWindows:
+ return new TargetLoweringObjectFileCOFF();
+ }
+}
+
+X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
+ : TargetLowering(TM, createTLOF(TM)) {
+ Subtarget = &TM.getSubtarget<X86Subtarget>();
+ X86ScalarSSEf64 = Subtarget->hasSSE2();
+ X86ScalarSSEf32 = Subtarget->hasSSE1();
+ X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;
+
+ RegInfo = TM.getRegisterInfo();
+ TD = getTargetData();
+
+ // Set up the TargetLowering object.
+
+ // X86 is weird: it always uses i8 for shift amounts and setcc results.
+ setShiftAmountType(MVT::i8);
+ setBooleanContents(ZeroOrOneBooleanContent);
+ setSchedulingPreference(Sched::RegPressure);
+ setStackPointerRegisterToSaveRestore(X86StackPtr);
+
+ if (Subtarget->isTargetDarwin()) {
+ // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
+ setUseUnderscoreSetJmp(false);
+ setUseUnderscoreLongJmp(false);
+ } else if (Subtarget->isTargetMingw()) {
+ // The MS runtime is weird: it exports _setjmp, but plain longjmp (with no
+ // leading underscore)!
+ setUseUnderscoreSetJmp(true);
+ setUseUnderscoreLongJmp(false);
+ } else {
+ setUseUnderscoreSetJmp(true);
+ setUseUnderscoreLongJmp(true);
+ }
+
+ // Set up the register classes.
+ addRegisterClass(MVT::i8, X86::GR8RegisterClass);
+ addRegisterClass(MVT::i16, X86::GR16RegisterClass);
+ addRegisterClass(MVT::i32, X86::GR32RegisterClass);
+ if (Subtarget->is64Bit())
+ addRegisterClass(MVT::i64, X86::GR64RegisterClass);
+
+ setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+
+ // We don't accept any truncstore of integer registers.
+ setTruncStoreAction(MVT::i64, MVT::i32, Expand);
+ setTruncStoreAction(MVT::i64, MVT::i16, Expand);
+ setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
+ setTruncStoreAction(MVT::i32, MVT::i16, Expand);
+ setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
+ setTruncStoreAction(MVT::i16, MVT::i8, Expand);
+
+ // SETOEQ and SETUNE require checking two conditions.
+ setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
+ setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
+ setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
+ setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
+
+ // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
+ // operation.
+ setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote);
+ setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote);
+ setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote);
+
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
+ setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Expand);
+ } else if (!UseSoftFloat) {
+ // We have an algorithm for SSE2->double, and we turn this into a
+ // 64-bit FILD followed by conditional FADD for other targets.
+ setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
+ // We have an algorithm for SSE2, and we turn this into a 64-bit
+ // FILD for other targets.
+ setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
+ }
+
+ // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
+ // this operation.
+ setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote);
+ setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote);
+
+ if (!UseSoftFloat) {
+ // SSE has no i16 to fp conversion, only i32
+ if (X86ScalarSSEf32) {
+ setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
+ // f32 and f64 cases are Legal, f80 case is not
+ setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
+ } else {
+ setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom);
+ setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
+ }
+ } else {
+ setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
+ setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote);
+ }
+
+ // In 32-bit mode these are custom lowered. In 64-bit mode f32 and f64
+ // are Legal; f80 is custom lowered.
+ setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom);
+ setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom);
+
+ // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
+ // this operation.
+ setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote);
+ setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote);
+
+ if (X86ScalarSSEf32) {
+ setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
+ // f32 and f64 cases are Legal, f80 case is not
+ setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
+ } else {
+ setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom);
+ setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
+ }
+
+ // Handle FP_TO_UINT by promoting the destination to a larger signed
+ // conversion.
+ setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote);
+ setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote);
+ setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote);
+
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
+ setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
+ } else if (!UseSoftFloat) {
+ if (X86ScalarSSEf32 && !Subtarget->hasSSE3())
+ // Expand FP_TO_UINT into a select.
+ // FIXME: We would like to use a Custom expander here eventually to do
+ // the optimal thing for SSE vs. the default expansion in the legalizer.
+ setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand);
+ else
+ // With SSE3 we can use fisttpll to convert to a signed i64; without
+ // SSE, we're stuck with a fistpll.
+ setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
+ }
+
+ // TODO: when we have SSE, these could be more efficient, by using movd/movq.
+ if (!X86ScalarSSEf64) {
+ setOperationAction(ISD::BIT_CONVERT , MVT::f32 , Expand);
+ setOperationAction(ISD::BIT_CONVERT , MVT::i32 , Expand);
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::BIT_CONVERT , MVT::f64 , Expand);
+ // Without SSE, i64->f64 goes through memory; i64->MMX is Legal.
+ if (Subtarget->hasMMX() && !DisableMMX)
+ setOperationAction(ISD::BIT_CONVERT , MVT::i64 , Custom);
+ else
+ setOperationAction(ISD::BIT_CONVERT , MVT::i64 , Expand);
+ }
+ }
+
+ // Scalar integer divide and remainder are lowered to use operations that
+ // produce two results, to match the available instructions. This exposes
+ // the two-result form to trivial CSE, which is able to combine x/y and x%y
+ // into a single instruction.
+ //
+ // Scalar integer multiply-high is also lowered to use two-result
+ // operations, to match the available instructions. However, plain multiply
+ // (low) operations are left as Legal, as there are single-result
+ // instructions for this in x86. Using the two-result multiply instructions
+ // when both high and low results are needed must be arranged by dagcombine.
+ setOperationAction(ISD::MULHS , MVT::i8 , Expand);
+ setOperationAction(ISD::MULHU , MVT::i8 , Expand);
+ setOperationAction(ISD::SDIV , MVT::i8 , Expand);
+ setOperationAction(ISD::UDIV , MVT::i8 , Expand);
+ setOperationAction(ISD::SREM , MVT::i8 , Expand);
+ setOperationAction(ISD::UREM , MVT::i8 , Expand);
+ setOperationAction(ISD::MULHS , MVT::i16 , Expand);
+ setOperationAction(ISD::MULHU , MVT::i16 , Expand);
+ setOperationAction(ISD::SDIV , MVT::i16 , Expand);
+ setOperationAction(ISD::UDIV , MVT::i16 , Expand);
+ setOperationAction(ISD::SREM , MVT::i16 , Expand);
+ setOperationAction(ISD::UREM , MVT::i16 , Expand);
+ setOperationAction(ISD::MULHS , MVT::i32 , Expand);
+ setOperationAction(ISD::MULHU , MVT::i32 , Expand);
+ setOperationAction(ISD::SDIV , MVT::i32 , Expand);
+ setOperationAction(ISD::UDIV , MVT::i32 , Expand);
+ setOperationAction(ISD::SREM , MVT::i32 , Expand);
+ setOperationAction(ISD::UREM , MVT::i32 , Expand);
+ setOperationAction(ISD::MULHS , MVT::i64 , Expand);
+ setOperationAction(ISD::MULHU , MVT::i64 , Expand);
+ setOperationAction(ISD::SDIV , MVT::i64 , Expand);
+ setOperationAction(ISD::UDIV , MVT::i64 , Expand);
+ setOperationAction(ISD::SREM , MVT::i64 , Expand);
+ setOperationAction(ISD::UREM , MVT::i64 , Expand);
+
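+  // x86 prefers an explicit compare plus a conditional branch or cmov, so the
+  // fused BR_CC / SELECT_CC forms are expanded and BRCOND is custom lowered.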
+ setOperationAction(ISD::BR_JT , MVT::Other, Expand);
+ setOperationAction(ISD::BRCOND , MVT::Other, Custom);
+ setOperationAction(ISD::BR_CC , MVT::Other, Expand);
+ setOperationAction(ISD::SELECT_CC , MVT::Other, Expand);
+ if (Subtarget->is64Bit())
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
+ setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand);
+ setOperationAction(ISD::FREM , MVT::f32 , Expand);
+ setOperationAction(ISD::FREM , MVT::f64 , Expand);
+ setOperationAction(ISD::FREM , MVT::f80 , Expand);
+ setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
+
+ setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
+ setOperationAction(ISD::CTTZ , MVT::i8 , Custom);
+ setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
+ setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
+ setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
+ setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
+ setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
+ setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
+ setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
+ setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
+ setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
+ }
+
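+  // READCYCLECOUNTER is custom lowered to RDTSC. There is no 16-bit byte-swap
+  // instruction, so BSWAP on i16 is expanded.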
+ setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
+ setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
+
+ // These should be promoted to a larger select which is supported.
+ setOperationAction(ISD::SELECT , MVT::i1 , Promote);
+ // X86 wants to expand cmov itself.
+ setOperationAction(ISD::SELECT , MVT::i8 , Custom);
+ setOperationAction(ISD::SELECT , MVT::i16 , Custom);
+ setOperationAction(ISD::SELECT , MVT::i32 , Custom);
+ setOperationAction(ISD::SELECT , MVT::f32 , Custom);
+ setOperationAction(ISD::SELECT , MVT::f64 , Custom);
+ setOperationAction(ISD::SELECT , MVT::f80 , Custom);
+ setOperationAction(ISD::SETCC , MVT::i8 , Custom);
+ setOperationAction(ISD::SETCC , MVT::i16 , Custom);
+ setOperationAction(ISD::SETCC , MVT::i32 , Custom);
+ setOperationAction(ISD::SETCC , MVT::f32 , Custom);
+ setOperationAction(ISD::SETCC , MVT::f64 , Custom);
+ setOperationAction(ISD::SETCC , MVT::f80 , Custom);
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::SELECT , MVT::i64 , Custom);
+ setOperationAction(ISD::SETCC , MVT::i64 , Custom);
+ }
+ setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
+
+ // Darwin ABI issue.
+ setOperationAction(ISD::ConstantPool , MVT::i32 , Custom);
+ setOperationAction(ISD::JumpTable , MVT::i32 , Custom);
+ setOperationAction(ISD::GlobalAddress , MVT::i32 , Custom);
+ setOperationAction(ISD::GlobalTLSAddress, MVT::i32 , Custom);
+ if (Subtarget->is64Bit())
+ setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
+ setOperationAction(ISD::ExternalSymbol , MVT::i32 , Custom);
+ setOperationAction(ISD::BlockAddress , MVT::i32 , Custom);
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::ConstantPool , MVT::i64 , Custom);
+ setOperationAction(ISD::JumpTable , MVT::i64 , Custom);
+ setOperationAction(ISD::GlobalAddress , MVT::i64 , Custom);
+ setOperationAction(ISD::ExternalSymbol, MVT::i64 , Custom);
+ setOperationAction(ISD::BlockAddress , MVT::i64 , Custom);
+ }
+  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
+ setOperationAction(ISD::SHL_PARTS , MVT::i32 , Custom);
+ setOperationAction(ISD::SRA_PARTS , MVT::i32 , Custom);
+ setOperationAction(ISD::SRL_PARTS , MVT::i32 , Custom);
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::SHL_PARTS , MVT::i64 , Custom);
+ setOperationAction(ISD::SRA_PARTS , MVT::i64 , Custom);
+ setOperationAction(ISD::SRL_PARTS , MVT::i64 , Custom);
+ }
+
+ if (Subtarget->hasSSE1())
+ setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
+
+ if (!Subtarget->hasSSE2())
+ setOperationAction(ISD::MEMBARRIER , MVT::Other, Expand);
+
+ // Expand certain atomics
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, Custom);
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, Custom);
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
+
+ setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i8, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i16, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
+
+ if (!Subtarget->is64Bit()) {
+ setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
+ }
+
+ // FIXME - use subtarget debug flags
+ if (!Subtarget->isTargetDarwin() &&
+ !Subtarget->isTargetELF() &&
+ !Subtarget->isTargetCygMing()) {
+ setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
+ }
+
+ setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
+ setOperationAction(ISD::EHSELECTION, MVT::i64, Expand);
+ setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
+ setOperationAction(ISD::EHSELECTION, MVT::i32, Expand);
+ if (Subtarget->is64Bit()) {
+ setExceptionPointerRegister(X86::RAX);
+ setExceptionSelectorRegister(X86::RDX);
+ } else {
+ setExceptionPointerRegister(X86::EAX);
+ setExceptionSelectorRegister(X86::EDX);
+ }
+ setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
+ setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
+
+ setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom);
+
+ setOperationAction(ISD::TRAP, MVT::Other, Legal);
+
+ // VASTART needs to be custom lowered to use the VarArgsFrameIndex
+ setOperationAction(ISD::VASTART , MVT::Other, Custom);
+ setOperationAction(ISD::VAEND , MVT::Other, Expand);
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::VAARG , MVT::Other, Custom);
+ setOperationAction(ISD::VACOPY , MVT::Other, Custom);
+ } else {
+ setOperationAction(ISD::VAARG , MVT::Other, Expand);
+ setOperationAction(ISD::VACOPY , MVT::Other, Expand);
+ }
+
+ setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+ setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
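+  // On Cygwin/MinGW, dynamic allocas must probe the stack (typically via a
+  // call such as _alloca), so DYNAMIC_STACKALLOC is custom lowered there.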
+ if (Subtarget->is64Bit())
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
+ if (Subtarget->isTargetCygMing())
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
+ else
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
+
+ if (!UseSoftFloat && X86ScalarSSEf64) {
+ // f32 and f64 use SSE.
+ // Set up the FP register classes.
+ addRegisterClass(MVT::f32, X86::FR32RegisterClass);
+ addRegisterClass(MVT::f64, X86::FR64RegisterClass);
+
+ // Use ANDPD to simulate FABS.
+ setOperationAction(ISD::FABS , MVT::f64, Custom);
+ setOperationAction(ISD::FABS , MVT::f32, Custom);
+
+ // Use XORP to simulate FNEG.
+ setOperationAction(ISD::FNEG , MVT::f64, Custom);
+ setOperationAction(ISD::FNEG , MVT::f32, Custom);
+
+ // Use ANDPD and ORPD to simulate FCOPYSIGN.
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
+
+ // We don't support sin/cos/fmod
+ setOperationAction(ISD::FSIN , MVT::f64, Expand);
+ setOperationAction(ISD::FCOS , MVT::f64, Expand);
+ setOperationAction(ISD::FSIN , MVT::f32, Expand);
+ setOperationAction(ISD::FCOS , MVT::f32, Expand);
+
+ // Expand FP immediates into loads from the stack, except for the special
+ // cases we handle.
+ addLegalFPImmediate(APFloat(+0.0)); // xorpd
+ addLegalFPImmediate(APFloat(+0.0f)); // xorps
+ } else if (!UseSoftFloat && X86ScalarSSEf32) {
+ // Use SSE for f32, x87 for f64.
+ // Set up the FP register classes.
+ addRegisterClass(MVT::f32, X86::FR32RegisterClass);
+ addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
+
+ // Use ANDPS to simulate FABS.
+ setOperationAction(ISD::FABS , MVT::f32, Custom);
+
+ // Use XORP to simulate FNEG.
+ setOperationAction(ISD::FNEG , MVT::f32, Custom);
+
+ setOperationAction(ISD::UNDEF, MVT::f64, Expand);
+
+ // Use ANDPS and ORPS to simulate FCOPYSIGN.
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
+
+ // We don't support sin/cos/fmod
+ setOperationAction(ISD::FSIN , MVT::f32, Expand);
+ setOperationAction(ISD::FCOS , MVT::f32, Expand);
+
+ // Special cases we handle for FP constants.
+ addLegalFPImmediate(APFloat(+0.0f)); // xorps
+ addLegalFPImmediate(APFloat(+0.0)); // FLD0
+ addLegalFPImmediate(APFloat(+1.0)); // FLD1
+ addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
+ addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
+
+ if (!UnsafeFPMath) {
+ setOperationAction(ISD::FSIN , MVT::f64 , Expand);
+ setOperationAction(ISD::FCOS , MVT::f64 , Expand);
+ }
+ } else if (!UseSoftFloat) {
+ // f32 and f64 in x87.
+ // Set up the FP register classes.
+ addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
+ addRegisterClass(MVT::f32, X86::RFP32RegisterClass);
+
+ setOperationAction(ISD::UNDEF, MVT::f64, Expand);
+ setOperationAction(ISD::UNDEF, MVT::f32, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+
+ if (!UnsafeFPMath) {
+ setOperationAction(ISD::FSIN , MVT::f64 , Expand);
+ setOperationAction(ISD::FCOS , MVT::f64 , Expand);
+ }
+ addLegalFPImmediate(APFloat(+0.0)); // FLD0
+ addLegalFPImmediate(APFloat(+1.0)); // FLD1
+ addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
+ addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
+ addLegalFPImmediate(APFloat(+0.0f)); // FLD0
+ addLegalFPImmediate(APFloat(+1.0f)); // FLD1
+ addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
+ addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
+ }
+
+ // Long double always uses X87.
+ if (!UseSoftFloat) {
+ addRegisterClass(MVT::f80, X86::RFP80RegisterClass);
+ setOperationAction(ISD::UNDEF, MVT::f80, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
+ {
+ bool ignored;
+ APFloat TmpFlt(+0.0);
+ TmpFlt.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
+ &ignored);
+ addLegalFPImmediate(TmpFlt); // FLD0
+ TmpFlt.changeSign();
+ addLegalFPImmediate(TmpFlt); // FLD0/FCHS
+ APFloat TmpFlt2(+1.0);
+ TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
+ &ignored);
+ addLegalFPImmediate(TmpFlt2); // FLD1
+ TmpFlt2.changeSign();
+ addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
+ }
+
+ if (!UnsafeFPMath) {
+ setOperationAction(ISD::FSIN , MVT::f80 , Expand);
+ setOperationAction(ISD::FCOS , MVT::f80 , Expand);
+ }
+ }
+
+ // Always use a library call for pow.
+ setOperationAction(ISD::FPOW , MVT::f32 , Expand);
+ setOperationAction(ISD::FPOW , MVT::f64 , Expand);
+ setOperationAction(ISD::FPOW , MVT::f80 , Expand);
+
+ setOperationAction(ISD::FLOG, MVT::f80, Expand);
+ setOperationAction(ISD::FLOG2, MVT::f80, Expand);
+ setOperationAction(ISD::FLOG10, MVT::f80, Expand);
+ setOperationAction(ISD::FEXP, MVT::f80, Expand);
+ setOperationAction(ISD::FEXP2, MVT::f80, Expand);
+
+ // First set operation action for all vector types to either promote
+ // (for widening) or expand (for scalarization). Then we will selectively
+ // turn on ones that can be effectively codegen'd.
+ for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
+ VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
+ setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::MUL , (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT,(MVT::SimpleValueType)VT,Expand);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand);
+ setOperationAction(ISD::INSERT_VECTOR_ELT,(MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,Expand);
+ setOperationAction(ISD::TRUNCATE, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::SIGN_EXTEND, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::ZERO_EXTEND, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::ANY_EXTEND, (MVT::SimpleValueType)VT, Expand);
+ for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
+ InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
+ setTruncStoreAction((MVT::SimpleValueType)VT,
+ (MVT::SimpleValueType)InnerVT, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
+ setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
+ }
+
+  // FIXME: In order to prevent SSE instructions from being expanded to MMX
+  // ones with -msoft-float, disable use of MMX as well.
+ if (!UseSoftFloat && !DisableMMX && Subtarget->hasMMX()) {
+ addRegisterClass(MVT::v8i8, X86::VR64RegisterClass, false);
+ addRegisterClass(MVT::v4i16, X86::VR64RegisterClass, false);
+ addRegisterClass(MVT::v2i32, X86::VR64RegisterClass, false);
+ addRegisterClass(MVT::v2f32, X86::VR64RegisterClass, false);
+ addRegisterClass(MVT::v1i64, X86::VR64RegisterClass, false);
+
+ setOperationAction(ISD::ADD, MVT::v8i8, Legal);
+ setOperationAction(ISD::ADD, MVT::v4i16, Legal);
+ setOperationAction(ISD::ADD, MVT::v2i32, Legal);
+ setOperationAction(ISD::ADD, MVT::v1i64, Legal);
+
+ setOperationAction(ISD::SUB, MVT::v8i8, Legal);
+ setOperationAction(ISD::SUB, MVT::v4i16, Legal);
+ setOperationAction(ISD::SUB, MVT::v2i32, Legal);
+ setOperationAction(ISD::SUB, MVT::v1i64, Legal);
+
+ setOperationAction(ISD::MULHS, MVT::v4i16, Legal);
+ setOperationAction(ISD::MUL, MVT::v4i16, Legal);
+
+ setOperationAction(ISD::AND, MVT::v8i8, Promote);
+ AddPromotedToType (ISD::AND, MVT::v8i8, MVT::v1i64);
+ setOperationAction(ISD::AND, MVT::v4i16, Promote);
+ AddPromotedToType (ISD::AND, MVT::v4i16, MVT::v1i64);
+ setOperationAction(ISD::AND, MVT::v2i32, Promote);
+ AddPromotedToType (ISD::AND, MVT::v2i32, MVT::v1i64);
+ setOperationAction(ISD::AND, MVT::v1i64, Legal);
+
+ setOperationAction(ISD::OR, MVT::v8i8, Promote);
+ AddPromotedToType (ISD::OR, MVT::v8i8, MVT::v1i64);
+ setOperationAction(ISD::OR, MVT::v4i16, Promote);
+ AddPromotedToType (ISD::OR, MVT::v4i16, MVT::v1i64);
+ setOperationAction(ISD::OR, MVT::v2i32, Promote);
+ AddPromotedToType (ISD::OR, MVT::v2i32, MVT::v1i64);
+ setOperationAction(ISD::OR, MVT::v1i64, Legal);
+
+ setOperationAction(ISD::XOR, MVT::v8i8, Promote);
+ AddPromotedToType (ISD::XOR, MVT::v8i8, MVT::v1i64);
+ setOperationAction(ISD::XOR, MVT::v4i16, Promote);
+ AddPromotedToType (ISD::XOR, MVT::v4i16, MVT::v1i64);
+ setOperationAction(ISD::XOR, MVT::v2i32, Promote);
+ AddPromotedToType (ISD::XOR, MVT::v2i32, MVT::v1i64);
+ setOperationAction(ISD::XOR, MVT::v1i64, Legal);
+
+ setOperationAction(ISD::LOAD, MVT::v8i8, Promote);
+ AddPromotedToType (ISD::LOAD, MVT::v8i8, MVT::v1i64);
+ setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
+ AddPromotedToType (ISD::LOAD, MVT::v4i16, MVT::v1i64);
+ setOperationAction(ISD::LOAD, MVT::v2i32, Promote);
+ AddPromotedToType (ISD::LOAD, MVT::v2i32, MVT::v1i64);
+ setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
+ AddPromotedToType (ISD::LOAD, MVT::v2f32, MVT::v1i64);
+ setOperationAction(ISD::LOAD, MVT::v1i64, Legal);
+
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v2f32, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v1i64, Custom);
+
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1i64, Custom);
+
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f32, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Custom);
+
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
+
+ setOperationAction(ISD::SELECT, MVT::v8i8, Promote);
+ setOperationAction(ISD::SELECT, MVT::v4i16, Promote);
+ setOperationAction(ISD::SELECT, MVT::v2i32, Promote);
+ setOperationAction(ISD::SELECT, MVT::v1i64, Custom);
+ setOperationAction(ISD::VSETCC, MVT::v8i8, Custom);
+ setOperationAction(ISD::VSETCC, MVT::v4i16, Custom);
+ setOperationAction(ISD::VSETCC, MVT::v2i32, Custom);
+
+ if (!X86ScalarSSEf64 && Subtarget->is64Bit()) {
+ setOperationAction(ISD::BIT_CONVERT, MVT::v8i8, Custom);
+ setOperationAction(ISD::BIT_CONVERT, MVT::v4i16, Custom);
+ setOperationAction(ISD::BIT_CONVERT, MVT::v2i32, Custom);
+ setOperationAction(ISD::BIT_CONVERT, MVT::v2f32, Custom);
+ setOperationAction(ISD::BIT_CONVERT, MVT::v1i64, Custom);
+ }
+ }
+
+ if (!UseSoftFloat && Subtarget->hasSSE1()) {
+ addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);
+
+ setOperationAction(ISD::FADD, MVT::v4f32, Legal);
+ setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
+ setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
+ setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
+ setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
+ setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
+ setOperationAction(ISD::LOAD, MVT::v4f32, Legal);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
+ setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
+ setOperationAction(ISD::VSETCC, MVT::v4f32, Custom);
+ }
+
+ if (!UseSoftFloat && Subtarget->hasSSE2()) {
+ addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);
+
+    // FIXME: Unfortunately -soft-float and -no-implicit-float mean XMM
+ // registers cannot be used even for integer operations.
+ addRegisterClass(MVT::v16i8, X86::VR128RegisterClass);
+ addRegisterClass(MVT::v8i16, X86::VR128RegisterClass);
+ addRegisterClass(MVT::v4i32, X86::VR128RegisterClass);
+ addRegisterClass(MVT::v2i64, X86::VR128RegisterClass);
+
+ setOperationAction(ISD::ADD, MVT::v16i8, Legal);
+ setOperationAction(ISD::ADD, MVT::v8i16, Legal);
+ setOperationAction(ISD::ADD, MVT::v4i32, Legal);
+ setOperationAction(ISD::ADD, MVT::v2i64, Legal);
+ setOperationAction(ISD::MUL, MVT::v2i64, Custom);
+ setOperationAction(ISD::SUB, MVT::v16i8, Legal);
+ setOperationAction(ISD::SUB, MVT::v8i16, Legal);
+ setOperationAction(ISD::SUB, MVT::v4i32, Legal);
+ setOperationAction(ISD::SUB, MVT::v2i64, Legal);
+ setOperationAction(ISD::MUL, MVT::v8i16, Legal);
+ setOperationAction(ISD::FADD, MVT::v2f64, Legal);
+ setOperationAction(ISD::FSUB, MVT::v2f64, Legal);
+ setOperationAction(ISD::FMUL, MVT::v2f64, Legal);
+ setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
+ setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
+ setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
+
+ setOperationAction(ISD::VSETCC, MVT::v2f64, Custom);
+ setOperationAction(ISD::VSETCC, MVT::v16i8, Custom);
+ setOperationAction(ISD::VSETCC, MVT::v8i16, Custom);
+ setOperationAction(ISD::VSETCC, MVT::v4i32, Custom);
+
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
+
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
+
+ // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
+ for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) {
+ EVT VT = (MVT::SimpleValueType)i;
+ // Do not attempt to custom lower non-power-of-2 vectors
+ if (!isPowerOf2_32(VT.getVectorNumElements()))
+ continue;
+ // Do not attempt to custom lower non-128-bit vectors
+ if (!VT.is128BitVector())
+ continue;
+ setOperationAction(ISD::BUILD_VECTOR,
+ VT.getSimpleVT().SimpleTy, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE,
+ VT.getSimpleVT().SimpleTy, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT,
+ VT.getSimpleVT().SimpleTy, Custom);
+ }
+
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
+
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
+ }
+
+ // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
+ for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) {
+ MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
+ EVT VT = SVT;
+
+ // Do not attempt to promote non-128-bit vectors
+ if (!VT.is128BitVector()) {
+ continue;
+ }
+
+ setOperationAction(ISD::AND, SVT, Promote);
+ AddPromotedToType (ISD::AND, SVT, MVT::v2i64);
+ setOperationAction(ISD::OR, SVT, Promote);
+ AddPromotedToType (ISD::OR, SVT, MVT::v2i64);
+ setOperationAction(ISD::XOR, SVT, Promote);
+ AddPromotedToType (ISD::XOR, SVT, MVT::v2i64);
+ setOperationAction(ISD::LOAD, SVT, Promote);
+ AddPromotedToType (ISD::LOAD, SVT, MVT::v2i64);
+ setOperationAction(ISD::SELECT, SVT, Promote);
+ AddPromotedToType (ISD::SELECT, SVT, MVT::v2i64);
+ }
+
+ setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
+ // Custom lower v2i64 and v2f64 selects.
+ setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
+ setOperationAction(ISD::LOAD, MVT::v2i64, Legal);
+ setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
+ setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
+
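+    // SSE2 has packed conversions for v4i32: cvttps2dq (fp-to-int) and
+    // cvtdq2ps (int-to-fp), so these are Legal.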
+ setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
+ if (!DisableMMX && Subtarget->hasMMX()) {
+ setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
+ }
+ }
+
+ if (Subtarget->hasSSE41()) {
+ // FIXME: Do we need to handle scalar-to-vector here?
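+    // SSE4.1 adds pmulld, making packed 32-bit multiplies legal.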
+ setOperationAction(ISD::MUL, MVT::v4i32, Legal);
+
+    // i8 and i16 vectors are custom, because the source register and source
+    // memory operand types are not the same width. f32 vectors are
+ // custom since the immediate controlling the insert encodes additional
+ // information.
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
+
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
+
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
+ }
+ }
+
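+  // SSE4.2 adds pcmpgtq, which allows v2i64 compares to be custom lowered.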
+ if (Subtarget->hasSSE42()) {
+ setOperationAction(ISD::VSETCC, MVT::v2i64, Custom);
+ }
+
+ if (!UseSoftFloat && Subtarget->hasAVX()) {
+ addRegisterClass(MVT::v8f32, X86::VR256RegisterClass);
+ addRegisterClass(MVT::v4f64, X86::VR256RegisterClass);
+ addRegisterClass(MVT::v8i32, X86::VR256RegisterClass);
+ addRegisterClass(MVT::v4i64, X86::VR256RegisterClass);
+
+ setOperationAction(ISD::LOAD, MVT::v8f32, Legal);
+ setOperationAction(ISD::LOAD, MVT::v8i32, Legal);
+ setOperationAction(ISD::LOAD, MVT::v4f64, Legal);
+ setOperationAction(ISD::LOAD, MVT::v4i64, Legal);
+ setOperationAction(ISD::FADD, MVT::v8f32, Legal);
+ setOperationAction(ISD::FSUB, MVT::v8f32, Legal);
+ setOperationAction(ISD::FMUL, MVT::v8f32, Legal);
+ setOperationAction(ISD::FDIV, MVT::v8f32, Legal);
+ setOperationAction(ISD::FSQRT, MVT::v8f32, Legal);
+ setOperationAction(ISD::FNEG, MVT::v8f32, Custom);
+ //setOperationAction(ISD::BUILD_VECTOR, MVT::v8f32, Custom);
+ //setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Custom);
+ //setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8f32, Custom);
+ //setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
+ //setOperationAction(ISD::VSETCC, MVT::v8f32, Custom);
+
+    // Operations to consider for v16i16 and v32i8 are commented out below.
+ //setOperationAction(ISD::ADD, MVT::v16i16, Legal);
+ setOperationAction(ISD::ADD, MVT::v8i32, Custom);
+ setOperationAction(ISD::ADD, MVT::v4i64, Custom);
+ //setOperationAction(ISD::SUB, MVT::v32i8, Legal);
+ //setOperationAction(ISD::SUB, MVT::v16i16, Legal);
+ setOperationAction(ISD::SUB, MVT::v8i32, Custom);
+ setOperationAction(ISD::SUB, MVT::v4i64, Custom);
+ //setOperationAction(ISD::MUL, MVT::v16i16, Legal);
+ setOperationAction(ISD::FADD, MVT::v4f64, Legal);
+ setOperationAction(ISD::FSUB, MVT::v4f64, Legal);
+ setOperationAction(ISD::FMUL, MVT::v4f64, Legal);
+ setOperationAction(ISD::FDIV, MVT::v4f64, Legal);
+ setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);
+ setOperationAction(ISD::FNEG, MVT::v4f64, Custom);
+
+ setOperationAction(ISD::VSETCC, MVT::v4f64, Custom);
+ // setOperationAction(ISD::VSETCC, MVT::v32i8, Custom);
+ // setOperationAction(ISD::VSETCC, MVT::v16i16, Custom);
+ setOperationAction(ISD::VSETCC, MVT::v8i32, Custom);
+
+ // setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i8, Custom);
+ // setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i16, Custom);
+ // setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i32, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f32, Custom);
+
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v4f64, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v4i64, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f64, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i64, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f64, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f64, Custom);
+
+#if 0
+ // Not sure we want to do this since there are no 256-bit integer
+ // operations in AVX
+
+ // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
+ // This includes 256-bit vectors
+ for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; ++i) {
+ EVT VT = (MVT::SimpleValueType)i;
+
+ // Do not attempt to custom lower non-power-of-2 vectors
+ if (!isPowerOf2_32(VT.getVectorNumElements()))
+ continue;
+
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ }
+
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i64, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i64, Custom);
+ }
+#endif
+
+#if 0
+ // Not sure we want to do this since there are no 256-bit integer
+ // operations in AVX
+
+ // Promote v32i8, v16i16, v8i32 load, select, and, or, xor to v4i64.
+ // Including 256-bit vectors
+ for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; i++) {
+ EVT VT = (MVT::SimpleValueType)i;
+
+ if (!VT.is256BitVector()) {
+ continue;
+ }
+ setOperationAction(ISD::AND, VT, Promote);
+ AddPromotedToType (ISD::AND, VT, MVT::v4i64);
+ setOperationAction(ISD::OR, VT, Promote);
+ AddPromotedToType (ISD::OR, VT, MVT::v4i64);
+ setOperationAction(ISD::XOR, VT, Promote);
+ AddPromotedToType (ISD::XOR, VT, MVT::v4i64);
+ setOperationAction(ISD::LOAD, VT, Promote);
+ AddPromotedToType (ISD::LOAD, VT, MVT::v4i64);
+ setOperationAction(ISD::SELECT, VT, Promote);
+ AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
+ }
+
+ setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+#endif
+ }
+
+ // We want to custom lower some of our intrinsics.
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+
+ // Add/Sub/Mul with overflow operations are custom lowered.
+ setOperationAction(ISD::SADDO, MVT::i32, Custom);
+ setOperationAction(ISD::SADDO, MVT::i64, Custom);
+ setOperationAction(ISD::UADDO, MVT::i32, Custom);
+ setOperationAction(ISD::UADDO, MVT::i64, Custom);
+ setOperationAction(ISD::SSUBO, MVT::i32, Custom);
+ setOperationAction(ISD::SSUBO, MVT::i64, Custom);
+ setOperationAction(ISD::USUBO, MVT::i32, Custom);
+ setOperationAction(ISD::USUBO, MVT::i64, Custom);
+ setOperationAction(ISD::SMULO, MVT::i32, Custom);
+ setOperationAction(ISD::SMULO, MVT::i64, Custom);
+
+ if (!Subtarget->is64Bit()) {
+ // These libcalls are not available in 32-bit.
+ setLibcallName(RTLIB::SHL_I128, 0);
+ setLibcallName(RTLIB::SRL_I128, 0);
+ setLibcallName(RTLIB::SRA_I128, 0);
+ }
+
+ // We have target-specific dag combine patterns for the following nodes:
+ setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
+ setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+ setTargetDAGCombine(ISD::BUILD_VECTOR);
+ setTargetDAGCombine(ISD::SELECT);
+ setTargetDAGCombine(ISD::SHL);
+ setTargetDAGCombine(ISD::SRA);
+ setTargetDAGCombine(ISD::SRL);
+ setTargetDAGCombine(ISD::OR);
+ setTargetDAGCombine(ISD::STORE);
+ setTargetDAGCombine(ISD::MEMBARRIER);
+ setTargetDAGCombine(ISD::ZERO_EXTEND);
+ if (Subtarget->is64Bit())
+ setTargetDAGCombine(ISD::MUL);
+
+ computeRegisterProperties();
+
+ // FIXME: These should be based on subtarget info. Plus, the values should
+ // be smaller when we are in optimizing for size mode.
+ maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
+ maxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
+ maxStoresPerMemmove = 3; // For @llvm.memmove -> sequence of stores
+ setPrefLoopAlignment(16);
+ benefitFromCodePlacementOpt = true;
+}
+
+
+MVT::SimpleValueType X86TargetLowering::getSetCCResultType(EVT VT) const {
+ return MVT::i8;
+}
+
+
+/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
+/// the desired ByVal argument alignment.
+static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) {
+ if (MaxAlign == 16)
+ return;
+ if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) {
+ if (VTy->getBitWidth() == 128)
+ MaxAlign = 16;
+ } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+ unsigned EltAlign = 0;
+ getMaxByValAlign(ATy->getElementType(), EltAlign);
+ if (EltAlign > MaxAlign)
+ MaxAlign = EltAlign;
+ } else if (const StructType *STy = dyn_cast<StructType>(Ty)) {
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ unsigned EltAlign = 0;
+ getMaxByValAlign(STy->getElementType(i), EltAlign);
+ if (EltAlign > MaxAlign)
+ MaxAlign = EltAlign;
+ if (MaxAlign == 16)
+ break;
+ }
+ }
+ return;
+}
+
+/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
+/// function arguments in the caller parameter area. For X86, aggregates
+/// that contain SSE vectors are placed at 16-byte boundaries while the rest
+/// are at 4-byte boundaries.
+unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const {
+ if (Subtarget->is64Bit()) {
+ // Max of 8 and alignment of type.
+ unsigned TyAlign = TD->getABITypeAlignment(Ty);
+ if (TyAlign > 8)
+ return TyAlign;
+ return 8;
+ }
+
+ unsigned Align = 4;
+ if (Subtarget->hasSSE1())
+ getMaxByValAlign(Ty, Align);
+ return Align;
+}
+
+/// getOptimalMemOpType - Returns the target specific optimal type for load
+/// and store operations as a result of memset, memcpy, and memmove
+/// lowering. If DstAlign is zero, the destination alignment can satisfy any
+/// constraint. Similarly, if SrcAlign is zero, there is no need to check it
+/// against an alignment requirement, probably because the source does not
+/// need to be loaded. If 'NonScalarIntSafe' is true, it is safe to return a
+/// non-scalar-integer type, e.g. an empty string source, a constant, or a
+/// value loaded from memory. 'MemcpyStrSrc' indicates whether the memcpy
+/// source is constant so it does not need to be loaded.
+/// It returns EVT::Other if the type should be determined using generic
+/// target-independent logic.
+EVT
+X86TargetLowering::getOptimalMemOpType(uint64_t Size,
+ unsigned DstAlign, unsigned SrcAlign,
+ bool NonScalarIntSafe,
+ bool MemcpyStrSrc,
+ MachineFunction &MF) const {
+ // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
+ // linux. This is because the stack realignment code can't handle certain
+ // cases like PR2962. This should be removed when PR2962 is fixed.
+ const Function *F = MF.getFunction();
+ if (NonScalarIntSafe &&
+ !F->hasFnAttr(Attribute::NoImplicitFloat)) {
+ if (Size >= 16 &&
+ (Subtarget->isUnalignedMemAccessFast() ||
+ ((DstAlign == 0 || DstAlign >= 16) &&
+ (SrcAlign == 0 || SrcAlign >= 16))) &&
+ Subtarget->getStackAlignment() >= 16) {
+ if (Subtarget->hasSSE2())
+ return MVT::v4i32;
+ if (Subtarget->hasSSE1())
+ return MVT::v4f32;
+ } else if (!MemcpyStrSrc && Size >= 8 &&
+ !Subtarget->is64Bit() &&
+ Subtarget->getStackAlignment() >= 8 &&
+ Subtarget->hasSSE2()) {
+      // Do not use f64 to lower memcpy if the source is a string constant.
+      // It's better to use i32 to avoid the loads.
+ return MVT::f64;
+ }
+ }
+ if (Subtarget->is64Bit() && Size >= 8)
+ return MVT::i64;
+ return MVT::i32;
+}
+
+/// getJumpTableEncoding - Return the entry encoding for a jump table in the
+/// current function. The returned value is a member of the
+/// MachineJumpTableInfo::JTEntryKind enum.
+unsigned X86TargetLowering::getJumpTableEncoding() const {
+ // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
+ // symbol.
+ if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+ Subtarget->isPICStyleGOT())
+ return MachineJumpTableInfo::EK_Custom32;
+
+ // Otherwise, use the normal jump table encoding heuristics.
+ return TargetLowering::getJumpTableEncoding();
+}
+
+/// getPICBaseSymbol - Return the X86-32 PIC base.
+MCSymbol *
+X86TargetLowering::getPICBaseSymbol(const MachineFunction *MF,
+ MCContext &Ctx) const {
+ const MCAsmInfo &MAI = *getTargetMachine().getMCAsmInfo();
+ return Ctx.GetOrCreateSymbol(Twine(MAI.getPrivateGlobalPrefix())+
+ Twine(MF->getFunctionNumber())+"$pb");
+}
+
+
+const MCExpr *
+X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
+ const MachineBasicBlock *MBB,
+ unsigned uid,MCContext &Ctx) const{
+ assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+ Subtarget->isPICStyleGOT());
+ // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
+ // entries.
+ return MCSymbolRefExpr::Create(MBB->getSymbol(),
+ MCSymbolRefExpr::VK_GOTOFF, Ctx);
+}
+
+/// getPICJumpTableRelocBase - Returns the relocation base for the given PIC
+/// jumptable.
+SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
+ SelectionDAG &DAG) const {
+ if (!Subtarget->is64Bit())
+ // This doesn't have DebugLoc associated with it, but is not really the
+ // same as a Register.
+ return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy());
+ return Table;
+}
+
+/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
+/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
+/// MCExpr.
+const MCExpr *X86TargetLowering::
+getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
+ MCContext &Ctx) const {
+ // X86-64 uses RIP relative addressing based on the jump table label.
+ if (Subtarget->isPICStyleRIPRel())
+ return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
+
+ // Otherwise, the reference is relative to the PIC base.
+ return MCSymbolRefExpr::Create(getPICBaseSymbol(MF, Ctx), Ctx);
+}
+
+/// getFunctionAlignment - Return the Log2 alignment of this function.
+unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const {
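+  // This is a Log2 value: 2^4 = 16-byte alignment normally, 2^0 = 1 byte when
+  // optimizing for size.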
+ return F->hasFnAttr(Attribute::OptimizeForSize) ? 0 : 4;
+}
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+#include "X86GenCallingConv.inc"
+
+bool
+X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<EVT> &OutTys,
+ const SmallVectorImpl<ISD::ArgFlagsTy> &ArgsFlags,
+ SelectionDAG &DAG) const {
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+ RVLocs, *DAG.getContext());
+ return CCInfo.CheckReturn(OutTys, ArgsFlags, RetCC_X86);
+}
+
+SDValue
+X86TargetLowering::LowerReturn(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ DebugLoc dl, SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+ RVLocs, *DAG.getContext());
+ CCInfo.AnalyzeReturn(Outs, RetCC_X86);
+
+ // Add the regs to the liveout set for the function.
+ MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
+ for (unsigned i = 0; i != RVLocs.size(); ++i)
+ if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg()))
+ MRI.addLiveOut(RVLocs[i].getLocReg());
+
+ SDValue Flag;
+
+ SmallVector<SDValue, 6> RetOps;
+ RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
+ // Operand #1 = Bytes To Pop
+ RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
+ MVT::i16));
+
+ // Copy the result values into the output registers.
+ for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ CCValAssign &VA = RVLocs[i];
+ assert(VA.isRegLoc() && "Can only return in registers!");
+ SDValue ValToCopy = Outs[i].Val;
+
+ // Returns in ST0/ST1 are handled specially: these are pushed as operands to
+ // the RET instruction and handled by the FP Stackifier.
+ if (VA.getLocReg() == X86::ST0 ||
+ VA.getLocReg() == X86::ST1) {
+ // If this is a copy from an xmm register to ST(0), use an FPExtend to
+ // change the value to the FP stack register class.
+ if (isScalarFPTypeInSSEReg(VA.getValVT()))
+ ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
+ RetOps.push_back(ValToCopy);
+ // Don't emit a copytoreg.
+ continue;
+ }
+
+ // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
+ // which is returned in RAX / RDX.
+ if (Subtarget->is64Bit()) {
+ EVT ValVT = ValToCopy.getValueType();
+ if (ValVT.isVector() && ValVT.getSizeInBits() == 64) {
+ ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, ValToCopy);
+ if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1)
+ ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, ValToCopy);
+ }
+ }
+
+ Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
+ Flag = Chain.getValue(1);
+ }
+
+ // The x86-64 ABI for returning structs by value requires that we copy
+ // the sret argument into %rax for the return. We saved the argument into
+ // a virtual register in the entry block, so now we copy the value out
+ // and into %rax.
+ if (Subtarget->is64Bit() &&
+ DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+ unsigned Reg = FuncInfo->getSRetReturnReg();
+ assert(Reg &&
+ "SRetReturnReg should have been set in LowerFormalArguments().");
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
+
+ Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag);
+ Flag = Chain.getValue(1);
+
+ // RAX now acts like a return value.
+ MRI.addLiveOut(X86::RAX);
+ }
+
+ RetOps[0] = Chain; // Update chain.
+
+ // Add the flag if we have it.
+ if (Flag.getNode())
+ RetOps.push_back(Flag);
+
+ return DAG.getNode(X86ISD::RET_FLAG, dl,
+ MVT::Other, &RetOps[0], RetOps.size());
+}
+
+/// LowerCallResult - Lower the result values of a call into the
+/// appropriate copies out of appropriate physical registers.
+///
+SDValue
+X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const {
+
+ // Assign locations to each value returned by this call.
+ SmallVector<CCValAssign, 16> RVLocs;
+ bool Is64Bit = Subtarget->is64Bit();
+ CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+ RVLocs, *DAG.getContext());
+ CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
+
+ // Copy all of the result registers out of their specified physreg.
+ for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ CCValAssign &VA = RVLocs[i];
+ EVT CopyVT = VA.getValVT();
+
+ // If this is x86-64, and we disabled SSE, we can't return FP values
+ if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
+ ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
+ report_fatal_error("SSE register return with SSE disabled");
+ }
+
+ // If this is a call to a function that returns an fp value on the floating
+ // point stack, but where we prefer to use the value in xmm registers, copy
+ // it out as F80 and use a truncate to move it from fp stack reg to xmm reg.
+ if ((VA.getLocReg() == X86::ST0 ||
+ VA.getLocReg() == X86::ST1) &&
+ isScalarFPTypeInSSEReg(VA.getValVT())) {
+ CopyVT = MVT::f80;
+ }
+
+ SDValue Val;
+ if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) {
+ // For x86-64, MMX values are returned in XMM0 / XMM1 except for v1i64.
+ if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
+ Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
+ MVT::v2i64, InFlag).getValue(1);
+ Val = Chain.getValue(0);
+ Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
+ Val, DAG.getConstant(0, MVT::i64));
+ } else {
+ Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
+ MVT::i64, InFlag).getValue(1);
+ Val = Chain.getValue(0);
+ }
+ Val = DAG.getNode(ISD::BIT_CONVERT, dl, CopyVT, Val);
+ } else {
+ Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
+ CopyVT, InFlag).getValue(1);
+ Val = Chain.getValue(0);
+ }
+ InFlag = Chain.getValue(2);
+
+ if (CopyVT != VA.getValVT()) {
+      // Round the f80 value to the right size, which also moves it to the
+      // appropriate xmm register.
+ Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
+ // This truncation won't change the value.
+ DAG.getIntPtrConstant(1));
+ }
+
+ InVals.push_back(Val);
+ }
+
+ return Chain;
+}
+
+
+//===----------------------------------------------------------------------===//
+// C & StdCall & Fast Calling Convention implementation
+//===----------------------------------------------------------------------===//
+// The StdCall calling convention is the standard for many Windows API
+// routines. It differs from the C calling convention only slightly: the
+// callee cleans up the stack rather than the caller, and symbols are
+// decorated with the total argument size (e.g. _foo@12). It does not
+// support vector arguments.
+// For info on fast calling convention see Fast Calling Convention (tail call)
+// implementation LowerX86_32FastCCCallTo.
+
+/// CallIsStructReturn - Determines whether a call uses struct return
+/// semantics.
+static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
+ if (Outs.empty())
+ return false;
+
+ return Outs[0].Flags.isSRet();
+}
+
+/// ArgsAreStructReturn - Determines whether a function uses struct
+/// return semantics.
+static bool
+ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
+ if (Ins.empty())
+ return false;
+
+ return Ins[0].Flags.isSRet();
+}
+
+/// IsCalleePop - Determines whether the callee is required to pop its
+/// own arguments. Callee pop is necessary to support tail calls.
+bool X86TargetLowering::IsCalleePop(bool IsVarArg,
+ CallingConv::ID CallingConv) const {
+ if (IsVarArg)
+ return false;
+
+ switch (CallingConv) {
+ default:
+ return false;
+ case CallingConv::X86_StdCall:
+ return !Subtarget->is64Bit();
+ case CallingConv::X86_FastCall:
+ return !Subtarget->is64Bit();
+ case CallingConv::X86_ThisCall:
+ return !Subtarget->is64Bit();
+ case CallingConv::Fast:
+ return GuaranteedTailCallOpt;
+ case CallingConv::GHC:
+ return GuaranteedTailCallOpt;
+ }
+}
+
+/// CCAssignFnForNode - Selects the correct CCAssignFn for the given
+/// calling convention.
+CCAssignFn *X86TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const {
+ if (Subtarget->is64Bit()) {
+ if (CC == CallingConv::GHC)
+ return CC_X86_64_GHC;
+ else if (Subtarget->isTargetWin64())
+ return CC_X86_Win64_C;
+ else
+ return CC_X86_64_C;
+ }
+
+ if (CC == CallingConv::X86_FastCall)
+ return CC_X86_32_FastCall;
+ else if (CC == CallingConv::X86_ThisCall)
+ return CC_X86_32_ThisCall;
+ else if (CC == CallingConv::Fast)
+ return CC_X86_32_FastCC;
+ else if (CC == CallingConv::GHC)
+ return CC_X86_32_GHC;
+ else
+ return CC_X86_32_C;
+}
+
+/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
+/// by "Src" to address "Dst" with size and alignment information specified by
+/// the specific parameter attribute. The copy will be passed as a byval
+/// function parameter.
+static SDValue
+CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
+ ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
+ DebugLoc dl) {
+ SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
+ return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
+ /*isVolatile*/false, /*AlwaysInline=*/true,
+ NULL, 0, NULL, 0);
+}
+
+/// IsTailCallConvention - Return true if the calling convention is one that
+/// supports tail call optimization.
+static bool IsTailCallConvention(CallingConv::ID CC) {
+ return (CC == CallingConv::Fast || CC == CallingConv::GHC);
+}
+
+/// FuncIsMadeTailCallSafe - Return true if the function is being made into
+/// a tailcall target by changing its ABI.
+static bool FuncIsMadeTailCallSafe(CallingConv::ID CC) {
+ return GuaranteedTailCallOpt && IsTailCallConvention(CC);
+}
+
+SDValue
+X86TargetLowering::LowerMemArgument(SDValue Chain,
+ CallingConv::ID CallConv,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ const CCValAssign &VA,
+ MachineFrameInfo *MFI,
+ unsigned i) const {
+ // Create the nodes corresponding to a load from this parameter slot.
+ ISD::ArgFlagsTy Flags = Ins[i].Flags;
+ bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv);
+ bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
+ EVT ValVT;
+
+ // If value is passed by pointer we have address passed instead of the value
+ // itself.
+ if (VA.getLocInfo() == CCValAssign::Indirect)
+ ValVT = VA.getLocVT();
+ else
+ ValVT = VA.getValVT();
+
+ // FIXME: For now, all byval parameter objects are marked mutable. This can be
+ // changed with more analysis.
+  // In case of tail call optimization, mark all arguments mutable, since they
+  // could be overwritten by the lowering of arguments for a tail call.
+ if (Flags.isByVal()) {
+ int FI = MFI->CreateFixedObject(Flags.getByValSize(),
+ VA.getLocMemOffset(), isImmutable, false);
+ return DAG.getFrameIndex(FI, getPointerTy());
+ } else {
+ int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
+ VA.getLocMemOffset(), isImmutable, false);
+ SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+ return DAG.getLoad(ValVT, dl, Chain, FIN,
+ PseudoSourceValue::getFixedStack(FI), 0,
+ false, false, 0);
+ }
+}
+
+SDValue
+X86TargetLowering::LowerFormalArguments(SDValue Chain,
+ CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl,
+ SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals)
+ const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+
+ const Function* Fn = MF.getFunction();
+ if (Fn->hasExternalLinkage() &&
+ Subtarget->isTargetCygMing() &&
+ Fn->getName() == "main")
+ FuncInfo->setForceFramePointer(true);
+
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ bool Is64Bit = Subtarget->is64Bit();
+ bool IsWin64 = Subtarget->isTargetWin64();
+
+ assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
+ "Var args not supported with calling convention fastcc or ghc");
+
+ // Assign locations to all of the incoming arguments.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+ ArgLocs, *DAG.getContext());
+ CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv));
+
+ unsigned LastVal = ~0U;
+ SDValue ArgValue;
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
+ // places.
+ assert(VA.getValNo() != LastVal &&
+ "Don't support value assigned to multiple locs yet");
+ LastVal = VA.getValNo();
+
+ if (VA.isRegLoc()) {
+ EVT RegVT = VA.getLocVT();
+ TargetRegisterClass *RC = NULL;
+ if (RegVT == MVT::i32)
+ RC = X86::GR32RegisterClass;
+ else if (Is64Bit && RegVT == MVT::i64)
+ RC = X86::GR64RegisterClass;
+ else if (RegVT == MVT::f32)
+ RC = X86::FR32RegisterClass;
+ else if (RegVT == MVT::f64)
+ RC = X86::FR64RegisterClass;
+ else if (RegVT.isVector() && RegVT.getSizeInBits() == 128)
+ RC = X86::VR128RegisterClass;
+ else if (RegVT.isVector() && RegVT.getSizeInBits() == 64)
+ RC = X86::VR64RegisterClass;
+ else
+ llvm_unreachable("Unknown argument type!");
+
+ unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+ ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
+
+ // If this is an 8 or 16-bit value, it is really passed promoted to 32
+ // bits. Insert an assert[sz]ext to capture this, then truncate to the
+ // right size.
+ if (VA.getLocInfo() == CCValAssign::SExt)
+ ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
+ DAG.getValueType(VA.getValVT()));
+ else if (VA.getLocInfo() == CCValAssign::ZExt)
+ ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
+ DAG.getValueType(VA.getValVT()));
+ else if (VA.getLocInfo() == CCValAssign::BCvt)
+ ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue);
+
+ if (VA.isExtInLoc()) {
+ // Handle MMX values passed in XMM regs.
+ if (RegVT.isVector()) {
+ ArgValue = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
+ ArgValue, DAG.getConstant(0, MVT::i64));
+ ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue);
+ } else
+ ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
+ }
+ } else {
+ assert(VA.isMemLoc());
+ ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
+ }
+
+ // If value is passed via pointer - do a load.
+ if (VA.getLocInfo() == CCValAssign::Indirect)
+ ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, NULL, 0,
+ false, false, 0);
+
+ InVals.push_back(ArgValue);
+ }
+
+ // The x86-64 ABI for returning structs by value requires that we copy
+ // the sret argument into %rax for the return. Save the argument into
+ // a virtual register so that we can access it from the return points.
+ if (Is64Bit && MF.getFunction()->hasStructRetAttr()) {
+ X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+ unsigned Reg = FuncInfo->getSRetReturnReg();
+ if (!Reg) {
+ Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
+ FuncInfo->setSRetReturnReg(Reg);
+ }
+ SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]);
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
+ }
+
+ unsigned StackSize = CCInfo.getNextStackOffset();
+ // Align stack specially for tail calls.
+ if (FuncIsMadeTailCallSafe(CallConv))
+ StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
+
+ // If the function takes variable number of arguments, make a frame index for
+ // the start of the first vararg value... for expansion of llvm.va_start.
+ if (isVarArg) {
+ if (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
+ CallConv != CallingConv::X86_ThisCall)) {
+ FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,
+ true, false));
+ }
+ if (Is64Bit) {
+ unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
+
+ // FIXME: We should really autogenerate these arrays
+ static const unsigned GPR64ArgRegsWin64[] = {
+ X86::RCX, X86::RDX, X86::R8, X86::R9
+ };
+ static const unsigned XMMArgRegsWin64[] = {
+ X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3
+ };
+ static const unsigned GPR64ArgRegs64Bit[] = {
+ X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
+ };
+ static const unsigned XMMArgRegs64Bit[] = {
+ X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
+ X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
+ };
+ const unsigned *GPR64ArgRegs, *XMMArgRegs;
+
+ if (IsWin64) {
+ TotalNumIntRegs = 4; TotalNumXMMRegs = 4;
+ GPR64ArgRegs = GPR64ArgRegsWin64;
+ XMMArgRegs = XMMArgRegsWin64;
+ } else {
+ TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
+ GPR64ArgRegs = GPR64ArgRegs64Bit;
+ XMMArgRegs = XMMArgRegs64Bit;
+ }
+ unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
+ TotalNumIntRegs);
+ unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs,
+ TotalNumXMMRegs);
+
+ bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat);
+ assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
+ "SSE register cannot be used when SSE is disabled!");
+ assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) &&
+ "SSE register cannot be used when SSE is disabled!");
+ if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1())
+        // Kernel mode asks for SSE to be disabled, so don't push the XMM
+        // registers on the stack.
+ TotalNumXMMRegs = 0;
+
+ // For X86-64, if there are vararg parameters that are passed via
+ // registers, then we must store them to their spots on the stack so they
+      // may be loaded by dereferencing the result of va_next.
+ FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
+ FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16);
+ FuncInfo->setRegSaveFrameIndex(
+ MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16,
+ false));
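+      // For example, on the SysV AMD64 ABI (the non-Win64 case) this creates
+      // the usual 176 byte register save area: 6 * 8 bytes for the integer
+      // registers followed by 8 * 16 bytes for the XMM registers, which the
+      // gp_offset and fp_offset fields of va_list index into as computed above.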
+
+ // Store the integer parameter registers.
+ SmallVector<SDValue, 8> MemOps;
+ SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
+ getPointerTy());
+ unsigned Offset = FuncInfo->getVarArgsGPOffset();
+ for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
+ SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
+ DAG.getIntPtrConstant(Offset));
+ unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
+ X86::GR64RegisterClass);
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
+ SDValue Store =
+ DAG.getStore(Val.getValue(1), dl, Val, FIN,
+ PseudoSourceValue::getFixedStack(
+ FuncInfo->getRegSaveFrameIndex()),
+ Offset, false, false, 0);
+ MemOps.push_back(Store);
+ Offset += 8;
+ }
+
+ if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) {
+ // Now store the XMM (fp + vector) parameter registers.
+ SmallVector<SDValue, 11> SaveXMMOps;
+ SaveXMMOps.push_back(Chain);
+
+ unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass);
+ SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
+ SaveXMMOps.push_back(ALVal);
+
+ SaveXMMOps.push_back(DAG.getIntPtrConstant(
+ FuncInfo->getRegSaveFrameIndex()));
+ SaveXMMOps.push_back(DAG.getIntPtrConstant(
+ FuncInfo->getVarArgsFPOffset()));
+
+ for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
+ unsigned VReg = MF.addLiveIn(XMMArgRegs[NumXMMRegs],
+ X86::VR128RegisterClass);
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
+ SaveXMMOps.push_back(Val);
+ }
+ MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
+ MVT::Other,
+ &SaveXMMOps[0], SaveXMMOps.size()));
+ }
+
+ if (!MemOps.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &MemOps[0], MemOps.size());
+ }
+ }
+
+ // Some CCs need callee pop.
+ if (IsCalleePop(isVarArg, CallConv)) {
+ FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
+ } else {
+ FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
+ // If this is an sret function, the return should pop the hidden pointer.
+ if (!Is64Bit && !IsTailCallConvention(CallConv) && ArgsAreStructReturn(Ins))
+ FuncInfo->setBytesToPopOnReturn(4);
+ }
+
+ if (!Is64Bit) {
+ // RegSaveFrameIndex is X86-64 only.
+ FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
+ if (CallConv == CallingConv::X86_FastCall ||
+ CallConv == CallingConv::X86_ThisCall)
+      // fastcall and thiscall functions can't have varargs.
+ FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
+ }
+
+ return Chain;
+}
+
+SDValue
+X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
+ SDValue StackPtr, SDValue Arg,
+ DebugLoc dl, SelectionDAG &DAG,
+ const CCValAssign &VA,
+ ISD::ArgFlagsTy Flags) const {
+ const unsigned FirstStackArgOffset = (Subtarget->isTargetWin64() ? 32 : 0);
+ unsigned LocMemOffset = FirstStackArgOffset + VA.getLocMemOffset();
+ SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
+ PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
+ if (Flags.isByVal()) {
+ return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
+ }
+ return DAG.getStore(Chain, dl, Arg, PtrOff,
+ PseudoSourceValue::getStack(), LocMemOffset,
+ false, false, 0);
+}
+
+/// EmitTailCallLoadRetAddr - Emit a load of the return address if tail call
+/// optimization is performed and it is required.
+SDValue
+X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
+ SDValue &OutRetAddr, SDValue Chain,
+ bool IsTailCall, bool Is64Bit,
+ int FPDiff, DebugLoc dl) const {
+ // Adjust the Return address stack slot.
+ EVT VT = getPointerTy();
+ OutRetAddr = getReturnAddressFrameIndex(DAG);
+
+ // Load the "old" Return address.
+ OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, NULL, 0, false, false, 0);
+ return SDValue(OutRetAddr.getNode(), 1);
+}
+
+/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
+/// optimization is performed and it is required (FPDiff!=0).
+static SDValue
+EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
+ SDValue Chain, SDValue RetAddrFrIdx,
+ bool Is64Bit, int FPDiff, DebugLoc dl) {
+ // Store the return address to the appropriate stack slot.
+ if (!FPDiff) return Chain;
+ // Calculate the new stack slot for the return address.
+ int SlotSize = Is64Bit ? 8 : 4;
+ int NewReturnAddrFI =
+ MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false, false);
+ EVT VT = Is64Bit ? MVT::i64 : MVT::i32;
+ SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT);
+ Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
+ PseudoSourceValue::getFixedStack(NewReturnAddrFI), 0,
+ false, false, 0);
+ return Chain;
+}
+
+SDValue
+X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg,
+ bool &isTailCall,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ bool Is64Bit = Subtarget->is64Bit();
+ bool IsStructRet = CallIsStructReturn(Outs);
+ bool IsSibcall = false;
+
+ if (isTailCall) {
+ // Check if it's really possible to do a tail call.
+ isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
+ isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(),
+ Outs, Ins, DAG);
+
+ // Sibcalls are automatically detected tailcalls which do not require
+ // ABI changes.
+ if (!GuaranteedTailCallOpt && isTailCall)
+ IsSibcall = true;
+
+ if (isTailCall)
+ ++NumTailCalls;
+ }
+
+ assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
+ "Var args not supported with calling convention fastcc or ghc");
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+ ArgLocs, *DAG.getContext());
+ CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv));
+
+ // Get a count of how many bytes are to be pushed on the stack.
+ unsigned NumBytes = CCInfo.getNextStackOffset();
+ if (IsSibcall)
+    // This is a sibcall. The memory operands are already available in the
+    // caller's incoming argument stack area.
+ NumBytes = 0;
+ else if (GuaranteedTailCallOpt && IsTailCallConvention(CallConv))
+ NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
+
+ int FPDiff = 0;
+ if (isTailCall && !IsSibcall) {
+ // Lower arguments at fp - stackoffset + fpdiff.
+ unsigned NumBytesCallerPushed =
+ MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn();
+ FPDiff = NumBytesCallerPushed - NumBytes;
+
+    // Set the delta of movement of the returnaddr stackslot.
+    // But only set it if the new delta is less than the previous delta.
+ if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta()))
+ MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff);
+ }
+
+ if (!IsSibcall)
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
+
+ SDValue RetAddrFrIdx;
+  // Load return address for tail calls.
+ if (isTailCall && FPDiff)
+ Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
+ Is64Bit, FPDiff, dl);
+
+ SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+ SmallVector<SDValue, 8> MemOpChains;
+ SDValue StackPtr;
+
+ // Walk the register/memloc assignments, inserting copies/loads. In the case
+  // of tail call optimization arguments are handled later.
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ EVT RegVT = VA.getLocVT();
+ SDValue Arg = Outs[i].Val;
+ ISD::ArgFlagsTy Flags = Outs[i].Flags;
+ bool isByVal = Flags.isByVal();
+
+ // Promote the value if needed.
+ switch (VA.getLocInfo()) {
+ default: llvm_unreachable("Unknown loc info!");
+ case CCValAssign::Full: break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
+ break;
+ case CCValAssign::AExt:
+ if (RegVT.isVector() && RegVT.getSizeInBits() == 128) {
+ // Special case: passing MMX values in XMM registers.
+ Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg);
+ Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
+ Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
+ } else
+ Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
+ break;
+ case CCValAssign::BCvt:
+ Arg = DAG.getNode(ISD::BIT_CONVERT, dl, RegVT, Arg);
+ break;
+ case CCValAssign::Indirect: {
+ // Store the argument.
+ SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
+ int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
+ Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
+ PseudoSourceValue::getFixedStack(FI), 0,
+ false, false, 0);
+ Arg = SpillSlot;
+ break;
+ }
+ }
+
+ if (VA.isRegLoc()) {
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ } else if (!IsSibcall && (!isTailCall || isByVal)) {
+ assert(VA.isMemLoc());
+ if (StackPtr.getNode() == 0)
+ StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy());
+ MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
+ dl, DAG, VA, Flags));
+ }
+ }
+
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &MemOpChains[0], MemOpChains.size());
+
+ // Build a sequence of copy-to-reg nodes chained together with token chain
+ // and flag operands which copy the outgoing args into registers.
+ SDValue InFlag;
+ // Tail call byval lowering might overwrite argument registers so in case of
+ // tail call optimization the copies to registers are lowered later.
+ if (!isTailCall)
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+ RegsToPass[i].second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ if (Subtarget->isPICStyleGOT()) {
+    // ELF / PIC requires the GOT pointer to be in the EBX register before
+    // function calls made via the PLT.
+ if (!isTailCall) {
+ Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
+ DAG.getNode(X86ISD::GlobalBaseReg,
+ DebugLoc(), getPointerTy()),
+ InFlag);
+ InFlag = Chain.getValue(1);
+ } else {
+ // If we are tail calling and generating PIC/GOT style code load the
+ // address of the callee into ECX. The value in ecx is used as target of
+ // the tail jump. This is done to circumvent the ebx/callee-saved problem
+ // for tail calls on PIC/GOT architectures. Normally we would just put the
+ // address of GOT into ebx and then call target@PLT. But for tail calls
+ // ebx would be restored (since ebx is callee saved) before jumping to the
+ // target@PLT.
+
+ // Note: The actual moving to ECX is done further down.
+ GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
+ if (G && !G->getGlobal()->hasHiddenVisibility() &&
+ !G->getGlobal()->hasProtectedVisibility())
+ Callee = LowerGlobalAddress(Callee, DAG);
+ else if (isa<ExternalSymbolSDNode>(Callee))
+ Callee = LowerExternalSymbol(Callee, DAG);
+ }
+ }
+
+ if (Is64Bit && isVarArg) {
+ // From AMD64 ABI document:
+ // For calls that may call functions that use varargs or stdargs
+ // (prototype-less calls or calls to functions containing ellipsis (...) in
+    // the declaration) %al is used as a hidden argument to specify the number
+    // of SSE registers used. The contents of %al do not need to match exactly
+    // the number of registers, but must be an upper bound on the number of SSE
+    // registers used and be in the range 0 - 8 inclusive.
+
+ // FIXME: Verify this on Win64
+ // Count the number of XMM registers allocated.
+ static const unsigned XMMArgRegs[] = {
+ X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
+ X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
+ };
+ unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
+ assert((Subtarget->hasSSE1() || !NumXMMRegs)
+ && "SSE registers cannot be used when SSE is disabled");
+
+ Chain = DAG.getCopyToReg(Chain, dl, X86::AL,
+ DAG.getConstant(NumXMMRegs, MVT::i8), InFlag);
+ InFlag = Chain.getValue(1);
+ }
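+  // For example, a vararg call that passes two doubles and a float in
+  // XMM0-XMM2 gets %al set to 3 by the copy above.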
+
+
+ // For tail calls lower the arguments to the 'real' stack slot.
+ if (isTailCall) {
+ // Force all the incoming stack arguments to be loaded from the stack
+ // before any new outgoing arguments are stored to the stack, because the
+ // outgoing stack slots may alias the incoming argument stack slots, and
+ // the alias isn't otherwise explicit. This is slightly more conservative
+ // than necessary, because it means that each store effectively depends
+ // on every argument instead of just those arguments it would clobber.
+ SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
+
+ SmallVector<SDValue, 8> MemOpChains2;
+ SDValue FIN;
+ int FI = 0;
+    // Do not flag preceding copytoreg stuff together with the following stuff.
+ InFlag = SDValue();
+ if (GuaranteedTailCallOpt) {
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ if (VA.isRegLoc())
+ continue;
+ assert(VA.isMemLoc());
+ SDValue Arg = Outs[i].Val;
+ ISD::ArgFlagsTy Flags = Outs[i].Flags;
+ // Create frame index.
+ int32_t Offset = VA.getLocMemOffset()+FPDiff;
+ uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
+ FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true, false);
+ FIN = DAG.getFrameIndex(FI, getPointerTy());
+
+ if (Flags.isByVal()) {
+ // Copy relative to framepointer.
+ SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
+ if (StackPtr.getNode() == 0)
+ StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr,
+ getPointerTy());
+ Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
+
+ MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
+ ArgChain,
+ Flags, DAG, dl));
+ } else {
+ // Store relative to framepointer.
+ MemOpChains2.push_back(
+ DAG.getStore(ArgChain, dl, Arg, FIN,
+ PseudoSourceValue::getFixedStack(FI), 0,
+ false, false, 0));
+ }
+ }
+ }
+
+ if (!MemOpChains2.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &MemOpChains2[0], MemOpChains2.size());
+
+ // Copy arguments to their registers.
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+ RegsToPass[i].second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+    InFlag = SDValue();
+
+ // Store the return address to the appropriate stack slot.
+ Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit,
+ FPDiff, dl);
+ }
+
+ bool WasGlobalOrExternal = false;
+ if (getTargetMachine().getCodeModel() == CodeModel::Large) {
+ assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
+ // In the 64-bit large code model, we have to make all calls
+ // through a register, since the call instruction's 32-bit
+ // pc-relative offset may not be large enough to hold the whole
+ // address.
+ } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ WasGlobalOrExternal = true;
+ // If the callee is a GlobalAddress node (quite common, every direct call
+ // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
+ // it.
+
+ // We should use extra load for direct calls to dllimported functions in
+ // non-JIT mode.
+ const GlobalValue *GV = G->getGlobal();
+ if (!GV->hasDLLImportLinkage()) {
+ unsigned char OpFlags = 0;
+
+ // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
+      // external symbols must go through the PLT in PIC mode. If the symbol
+ // has hidden or protected visibility, or if it is static or local, then
+ // we don't need to use the PLT - we can directly call it.
+ if (Subtarget->isTargetELF() &&
+ getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+ GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
+ OpFlags = X86II::MO_PLT;
+ } else if (Subtarget->isPICStyleStubAny() &&
+ (GV->isDeclaration() || GV->isWeakForLinker()) &&
+ Subtarget->getDarwinVers() < 9) {
+ // PC-relative references to external symbols should go through $stub,
+ // unless we're building with the leopard linker or later, which
+ // automatically synthesizes these stubs.
+ OpFlags = X86II::MO_DARWIN_STUB;
+ }
+
+ Callee = DAG.getTargetGlobalAddress(GV, getPointerTy(),
+ G->getOffset(), OpFlags);
+ }
+ } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+ WasGlobalOrExternal = true;
+ unsigned char OpFlags = 0;
+
+ // On ELF targets, in either X86-64 or X86-32 mode, direct calls to external
+ // symbols should go through the PLT.
+ if (Subtarget->isTargetELF() &&
+ getTargetMachine().getRelocationModel() == Reloc::PIC_) {
+ OpFlags = X86II::MO_PLT;
+ } else if (Subtarget->isPICStyleStubAny() &&
+ Subtarget->getDarwinVers() < 9) {
+ // PC-relative references to external symbols should go through $stub,
+ // unless we're building with the leopard linker or later, which
+ // automatically synthesizes these stubs.
+ OpFlags = X86II::MO_DARWIN_STUB;
+ }
+
+ Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
+ OpFlags);
+ }
+
+ // Returns a chain & a flag for retval copy to use.
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+ SmallVector<SDValue, 8> Ops;
+
+ if (!IsSibcall && isTailCall) {
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
+ DAG.getIntPtrConstant(0, true), InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ Ops.push_back(Chain);
+ Ops.push_back(Callee);
+
+ if (isTailCall)
+ Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
+
+ // Add argument registers to the end of the list so that they are known live
+ // into the call.
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+ Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+ RegsToPass[i].second.getValueType()));
+
+  // Add an implicit use of the GOT pointer in EBX.
+ if (!isTailCall && Subtarget->isPICStyleGOT())
+ Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));
+
+ // Add an implicit use of AL for x86 vararg functions.
+ if (Is64Bit && isVarArg)
+ Ops.push_back(DAG.getRegister(X86::AL, MVT::i8));
+
+ if (InFlag.getNode())
+ Ops.push_back(InFlag);
+
+ if (isTailCall) {
+ // If this is the first return lowered for this function, add the regs
+ // to the liveout set for the function.
+ if (MF.getRegInfo().liveout_empty()) {
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CallConv, isVarArg, getTargetMachine(), RVLocs,
+ *DAG.getContext());
+ CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
+ for (unsigned i = 0; i != RVLocs.size(); ++i)
+ if (RVLocs[i].isRegLoc())
+ MF.getRegInfo().addLiveOut(RVLocs[i].getLocReg());
+ }
+ return DAG.getNode(X86ISD::TC_RETURN, dl,
+ NodeTys, &Ops[0], Ops.size());
+ }
+
+ Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
+ InFlag = Chain.getValue(1);
+
+ // Create the CALLSEQ_END node.
+ unsigned NumBytesForCalleeToPush;
+ if (IsCalleePop(isVarArg, CallConv))
+ NumBytesForCalleeToPush = NumBytes; // Callee pops everything
+ else if (!Is64Bit && !IsTailCallConvention(CallConv) && IsStructRet)
+ // If this is a call to a struct-return function, the callee
+ // pops the hidden struct pointer, so we have to push it back.
+ // This is common for Darwin/X86, Linux & Mingw32 targets.
+ NumBytesForCalleeToPush = 4;
+ else
+ NumBytesForCalleeToPush = 0; // Callee pops nothing.
+
+ // Returns a flag for retval copy to use.
+ if (!IsSibcall) {
+ Chain = DAG.getCALLSEQ_END(Chain,
+ DAG.getIntPtrConstant(NumBytes, true),
+ DAG.getIntPtrConstant(NumBytesForCalleeToPush,
+ true),
+ InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ // Handle result values, copying them out of physregs into vregs that we
+ // return.
+ return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
+ Ins, dl, DAG, InVals);
+}
+
+
+//===----------------------------------------------------------------------===//
+// Fast Calling Convention (tail call) implementation
+//===----------------------------------------------------------------------===//
+
+// Like stdcall (the callee cleans up the arguments), except that ECX is
+// reserved for storing the tail called function address. Only 2 registers are
+// free for argument passing (inreg). Tail call optimization is performed
+// provided:
+// * tailcallopt is enabled
+// * caller/callee are fastcc
+// On X86_64 architecture with GOT-style position independent code only local
+// (within module) calls are supported at the moment.
+// To keep the stack aligned according to the platform ABI, the function
+// GetAlignedArgumentStackSize ensures that the argument delta is always a
+// multiple of the stack alignment. (Dynamic linkers need this - Darwin's dyld
+// for example.)
+// If a tail called function callee has more arguments than the caller the
+// caller needs to make sure that there is room to move the RETADDR to. This is
+// achieved by reserving an area the size of the argument delta right after the
+// original RETADDR, but before the saved framepointer or the spilled registers,
+// e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
+// stack layout:
+// arg1
+// arg2
+// RETADDR
+// [ new RETADDR
+// move area ]
+// (possible EBP)
+// ESI
+// EDI
+// local1 ..
+
+/// GetAlignedArgumentStackSize - Make the stack size aligned, e.g. to
+/// 16n + 12 bytes, for a 16 byte alignment requirement.
+unsigned
+X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
+ SelectionDAG& DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ const TargetMachine &TM = MF.getTarget();
+ const TargetFrameInfo &TFI = *TM.getFrameInfo();
+ unsigned StackAlignment = TFI.getStackAlignment();
+ uint64_t AlignMask = StackAlignment - 1;
+ int64_t Offset = StackSize;
+ uint64_t SlotSize = TD->getPointerSize();
+  if ((Offset & AlignMask) <= (StackAlignment - SlotSize)) {
+ // Number smaller than 12 so just add the difference.
+ Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
+ } else {
+ // Mask out lower bits, add stackalignment once plus the 12 bytes.
+ Offset = ((~AlignMask) & Offset) + StackAlignment +
+ (StackAlignment-SlotSize);
+ }
+ return Offset;
+}
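+// Worked example, assuming a 16 byte stack alignment and 4 byte stack slots:
+// a StackSize of 20 has (Offset & AlignMask) == 4 <= 12, so 8 is added and the
+// result is 28 == 16 + 12; a StackSize of 29 takes the second branch and is
+// rounded up to 16 + 16 + 12 == 44.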
+
+/// MatchingStackOffset - Return true if the given stack call argument is
+/// already available in the same relative position in the caller's
+/// incoming argument stack.
+static
+bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
+ MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
+ const X86InstrInfo *TII) {
+ unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
+ int FI = INT_MAX;
+ if (Arg.getOpcode() == ISD::CopyFromReg) {
+ unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
+ if (!VR || TargetRegisterInfo::isPhysicalRegister(VR))
+ return false;
+ MachineInstr *Def = MRI->getVRegDef(VR);
+ if (!Def)
+ return false;
+ if (!Flags.isByVal()) {
+ if (!TII->isLoadFromStackSlot(Def, FI))
+ return false;
+ } else {
+ unsigned Opcode = Def->getOpcode();
+ if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
+ Def->getOperand(1).isFI()) {
+ FI = Def->getOperand(1).getIndex();
+ Bytes = Flags.getByValSize();
+ } else
+ return false;
+ }
+ } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
+ if (Flags.isByVal())
+ // ByVal argument is passed in as a pointer but it's now being
+ // dereferenced. e.g.
+ // define @foo(%struct.X* %A) {
+ // tail call @bar(%struct.X* byval %A)
+ // }
+ return false;
+ SDValue Ptr = Ld->getBasePtr();
+ FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
+ if (!FINode)
+ return false;
+ FI = FINode->getIndex();
+ } else
+ return false;
+
+ assert(FI != INT_MAX);
+ if (!MFI->isFixedObjectIndex(FI))
+ return false;
+ return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
+}
+
+/// IsEligibleForTailCallOptimization - Check whether the call is eligible
+/// for tail call optimization. Targets which want to do tail call
+/// optimization should implement this function.
+bool
+X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
+ CallingConv::ID CalleeCC,
+ bool isVarArg,
+ bool isCalleeStructRet,
+ bool isCallerStructRet,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ SelectionDAG& DAG) const {
+ if (!IsTailCallConvention(CalleeCC) &&
+ CalleeCC != CallingConv::C)
+ return false;
+
+ // If -tailcallopt is specified, make fastcc functions tail-callable.
+ const MachineFunction &MF = DAG.getMachineFunction();
+ const Function *CallerF = DAG.getMachineFunction().getFunction();
+ CallingConv::ID CallerCC = CallerF->getCallingConv();
+ bool CCMatch = CallerCC == CalleeCC;
+
+ if (GuaranteedTailCallOpt) {
+ if (IsTailCallConvention(CalleeCC) && CCMatch)
+ return true;
+ return false;
+ }
+
+  // Look for obvious safe cases to perform tail call optimization that do not
+  // require ABI changes. This is what gcc calls sibcall.
+
+ // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
+ // emit a special epilogue.
+ if (RegInfo->needsStackRealignment(MF))
+ return false;
+
+  // Do not sibcall optimize vararg calls unless the call site passes no
+  // arguments.
+ if (isVarArg && !Outs.empty())
+ return false;
+
+ // Also avoid sibcall optimization if either caller or callee uses struct
+ // return semantics.
+ if (isCalleeStructRet || isCallerStructRet)
+ return false;
+
+ // If the call result is in ST0 / ST1, it needs to be popped off the x87 stack.
+  // Therefore, if the result is not used at the call site, it is not safe to
+  // optimize this into a sibcall.
+ bool Unused = false;
+ for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
+ if (!Ins[i].Used) {
+ Unused = true;
+ break;
+ }
+ }
+ if (Unused) {
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CalleeCC, false, getTargetMachine(),
+ RVLocs, *DAG.getContext());
+ CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
+ for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
+ CCValAssign &VA = RVLocs[i];
+ if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1)
+ return false;
+ }
+ }
+
+ // If the calling conventions do not match, then we'd better make sure the
+ // results are returned in the same way as what the caller expects.
+ if (!CCMatch) {
+ SmallVector<CCValAssign, 16> RVLocs1;
+ CCState CCInfo1(CalleeCC, false, getTargetMachine(),
+ RVLocs1, *DAG.getContext());
+ CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
+
+ SmallVector<CCValAssign, 16> RVLocs2;
+ CCState CCInfo2(CallerCC, false, getTargetMachine(),
+ RVLocs2, *DAG.getContext());
+ CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
+
+ if (RVLocs1.size() != RVLocs2.size())
+ return false;
+ for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
+ if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
+ return false;
+ if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
+ return false;
+ if (RVLocs1[i].isRegLoc()) {
+ if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
+ return false;
+ } else {
+ if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
+ return false;
+ }
+ }
+ }
+
+ // If the callee takes no arguments then go on to check the results of the
+ // call.
+ if (!Outs.empty()) {
+ // Check if stack adjustment is needed. For now, do not do this if any
+ // argument is passed on the stack.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CalleeCC, isVarArg, getTargetMachine(),
+ ArgLocs, *DAG.getContext());
+ CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC));
+ if (CCInfo.getNextStackOffset()) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
+ return false;
+ if (Subtarget->isTargetWin64())
+ // Win64 ABI has additional complications.
+ return false;
+
+      // Check whether the arguments are already laid out in the right way,
+      // matching the caller's fixed stack objects.
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ const MachineRegisterInfo *MRI = &MF.getRegInfo();
+ const X86InstrInfo *TII =
+ ((X86TargetMachine&)getTargetMachine()).getInstrInfo();
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ EVT RegVT = VA.getLocVT();
+ SDValue Arg = Outs[i].Val;
+ ISD::ArgFlagsTy Flags = Outs[i].Flags;
+ if (VA.getLocInfo() == CCValAssign::Indirect)
+ return false;
+ if (!VA.isRegLoc()) {
+ if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
+ MFI, MRI, TII))
+ return false;
+ }
+ }
+ }
+ }
+
+ return true;
+}
+
+FastISel *
+X86TargetLowering::createFastISel(MachineFunction &mf,
+ DenseMap<const Value *, unsigned> &vm,
+ DenseMap<const BasicBlock*, MachineBasicBlock*> &bm,
+ DenseMap<const AllocaInst *, int> &am,
+ std::vector<std::pair<MachineInstr*, unsigned> > &pn
+#ifndef NDEBUG
+ , SmallSet<const Instruction *, 8> &cil
+#endif
+ ) const {
+ return X86::createFastISel(mf, vm, bm, am, pn
+#ifndef NDEBUG
+ , cil
+#endif
+ );
+}
+
+
+//===----------------------------------------------------------------------===//
+// Other Lowering Hooks
+//===----------------------------------------------------------------------===//
+
+
+SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+ int ReturnAddrIndex = FuncInfo->getRAIndex();
+
+ if (ReturnAddrIndex == 0) {
+ // Set up a frame object for the return address.
+ uint64_t SlotSize = TD->getPointerSize();
+ ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
+ false, false);
+ FuncInfo->setRAIndex(ReturnAddrIndex);
+ }
+
+ return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
+}
+
+
+bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
+ bool hasSymbolicDisplacement) {
+ // Offset should fit into 32 bit immediate field.
+ if (!isInt<32>(Offset))
+ return false;
+
+ // If we don't have a symbolic displacement - we don't have any extra
+ // restrictions.
+ if (!hasSymbolicDisplacement)
+ return true;
+
+ // FIXME: Some tweaks might be needed for medium code model.
+ if (M != CodeModel::Small && M != CodeModel::Kernel)
+ return false;
+
+  // For the small code model we assume that the last object is 16MB before the
+  // end of the 31 bit boundary. We may also accept pretty large negative
+  // constants knowing that all objects are in the positive half of the address
+  // space.
+ if (M == CodeModel::Small && Offset < 16*1024*1024)
+ return true;
+
+  // For the kernel code model we know that all objects reside in the negative
+  // half of the 32 bit address space. We must not accept negative offsets,
+  // since they may be just off, but we may accept pretty large positive ones.
+ if (M == CodeModel::Kernel && Offset > 0)
+ return true;
+
+ return false;
+}
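+// For instance, with a symbolic displacement in the small code model an offset
+// of 8 MB is accepted while one of 20 MB is rejected, and in the kernel code
+// model only strictly positive offsets are accepted.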
+
+/// TranslateX86CC - do a one to one translation of an ISD::CondCode to the X86
+/// specific condition code, returning the condition code and the LHS/RHS of the
+/// comparison to make.
+static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
+ SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
+ if (!isFP) {
+ if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
+ if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
+ // X > -1 -> X == 0, jump !sign.
+ RHS = DAG.getConstant(0, RHS.getValueType());
+ return X86::COND_NS;
+ } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
+ // X < 0 -> X == 0, jump on sign.
+ return X86::COND_S;
+ } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
+ // X < 1 -> X <= 0
+ RHS = DAG.getConstant(0, RHS.getValueType());
+ return X86::COND_LE;
+ }
+ }
+
+ switch (SetCCOpcode) {
+ default: llvm_unreachable("Invalid integer condition!");
+ case ISD::SETEQ: return X86::COND_E;
+ case ISD::SETGT: return X86::COND_G;
+ case ISD::SETGE: return X86::COND_GE;
+ case ISD::SETLT: return X86::COND_L;
+ case ISD::SETLE: return X86::COND_LE;
+ case ISD::SETNE: return X86::COND_NE;
+ case ISD::SETULT: return X86::COND_B;
+ case ISD::SETUGT: return X86::COND_A;
+ case ISD::SETULE: return X86::COND_BE;
+ case ISD::SETUGE: return X86::COND_AE;
+ }
+ }
+
+ // First determine if it is required or is profitable to flip the operands.
+
+ // If LHS is a foldable load, but RHS is not, flip the condition.
+ if ((ISD::isNON_EXTLoad(LHS.getNode()) && LHS.hasOneUse()) &&
+ !(ISD::isNON_EXTLoad(RHS.getNode()) && RHS.hasOneUse())) {
+ SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
+ std::swap(LHS, RHS);
+ }
+
+ switch (SetCCOpcode) {
+ default: break;
+ case ISD::SETOLT:
+ case ISD::SETOLE:
+ case ISD::SETUGT:
+ case ISD::SETUGE:
+ std::swap(LHS, RHS);
+ break;
+ }
+
+ // On a floating point condition, the flags are set as follows:
+ // ZF PF CF op
+ // 0 | 0 | 0 | X > Y
+ // 0 | 0 | 1 | X < Y
+ // 1 | 0 | 0 | X == Y
+ // 1 | 1 | 1 | unordered
+ switch (SetCCOpcode) {
+ default: llvm_unreachable("Condcode should be pre-legalized away");
+ case ISD::SETUEQ:
+ case ISD::SETEQ: return X86::COND_E;
+ case ISD::SETOLT: // flipped
+ case ISD::SETOGT:
+ case ISD::SETGT: return X86::COND_A;
+ case ISD::SETOLE: // flipped
+ case ISD::SETOGE:
+ case ISD::SETGE: return X86::COND_AE;
+ case ISD::SETUGT: // flipped
+ case ISD::SETULT:
+ case ISD::SETLT: return X86::COND_B;
+ case ISD::SETUGE: // flipped
+ case ISD::SETULE:
+ case ISD::SETLE: return X86::COND_BE;
+ case ISD::SETONE:
+ case ISD::SETNE: return X86::COND_NE;
+ case ISD::SETUO: return X86::COND_P;
+ case ISD::SETO: return X86::COND_NP;
+ case ISD::SETOEQ:
+ case ISD::SETUNE: return X86::COND_INVALID;
+ }
+}
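+// As an illustration: for a SETOLT of x and y the operands are swapped above
+// and COND_A is returned, so with the flag settings shown in the table
+// CF == 0 && ZF == 0 on the swapped comparison means y > x, i.e. the original
+// ordered x < y, while unordered operands (ZF == PF == CF == 1) fail the test
+// as required.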
+
+/// hasFPCMov - is there a floating point cmov for the specific X86 condition
+/// code. The current x86 ISA includes the following FP cmov instructions:
+/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
+static bool hasFPCMov(unsigned X86CC) {
+ switch (X86CC) {
+ default:
+ return false;
+ case X86::COND_B:
+ case X86::COND_BE:
+ case X86::COND_E:
+ case X86::COND_P:
+ case X86::COND_A:
+ case X86::COND_AE:
+ case X86::COND_NE:
+ case X86::COND_NP:
+ return true;
+ }
+}
+
+/// isFPImmLegal - Returns true if the target can instruction select the
+/// specified FP immediate natively. If false, the legalizer will
+/// materialize the FP immediate as a load from a constant pool.
+bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+ for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
+ if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
+ return true;
+ }
+ return false;
+}
+
+/// isUndefOrInRange - Return true if Val is undef or if its value falls within
+/// the specified range [Low, Hi).
+static bool isUndefOrInRange(int Val, int Low, int Hi) {
+ return (Val < 0) || (Val >= Low && Val < Hi);
+}
+
+/// isUndefOrEqual - Val is either less than zero (undef) or equal to the
+/// specified value.
+static bool isUndefOrEqual(int Val, int CmpVal) {
+ if (Val < 0 || Val == CmpVal)
+ return true;
+ return false;
+}
+
+/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
+/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference
+/// the second operand.
+static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) {
+ if (VT == MVT::v4f32 || VT == MVT::v4i32 || VT == MVT::v4i16)
+ return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
+ if (VT == MVT::v2f64 || VT == MVT::v2i64)
+ return (Mask[0] < 2 && Mask[1] < 2);
+ return false;
+}
+
+bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) {
+ SmallVector<int, 8> M;
+ N->getMask(M);
+ return ::isPSHUFDMask(M, N->getValueType(0));
+}
+
+/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
+/// is suitable for input to PSHUFHW.
+static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
+ if (VT != MVT::v8i16)
+ return false;
+
+ // Lower quadword copied in order or undef.
+ for (int i = 0; i != 4; ++i)
+ if (Mask[i] >= 0 && Mask[i] != i)
+ return false;
+
+ // Upper quadword shuffled.
+ for (int i = 4; i != 8; ++i)
+ if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7))
+ return false;
+
+ return true;
+}
+
+bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) {
+ SmallVector<int, 8> M;
+ N->getMask(M);
+ return ::isPSHUFHWMask(M, N->getValueType(0));
+}
+
+/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
+/// is suitable for input to PSHUFLW.
+static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
+ if (VT != MVT::v8i16)
+ return false;
+
+ // Upper quadword copied in order.
+ for (int i = 4; i != 8; ++i)
+ if (Mask[i] >= 0 && Mask[i] != i)
+ return false;
+
+ // Lower quadword shuffled.
+ for (int i = 0; i != 4; ++i)
+ if (Mask[i] >= 4)
+ return false;
+
+ return true;
+}
+
+bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) {
+ SmallVector<int, 8> M;
+ N->getMask(M);
+ return ::isPSHUFLWMask(M, N->getValueType(0));
+}
+
+/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
+/// is suitable for input to PALIGNR.
+static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT,
+ bool hasSSSE3) {
+ int i, e = VT.getVectorNumElements();
+
+ // Do not handle v2i64 / v2f64 shuffles with palignr.
+ if (e < 4 || !hasSSSE3)
+ return false;
+
+ for (i = 0; i != e; ++i)
+ if (Mask[i] >= 0)
+ break;
+
+ // All undef, not a palignr.
+ if (i == e)
+ return false;
+
+ // Determine if it's ok to perform a palignr with only the LHS, since we
+ // don't have access to the actual shuffle elements to see if RHS is undef.
+ bool Unary = Mask[i] < (int)e;
+ bool NeedsUnary = false;
+
+ int s = Mask[i] - i;
+
+ // Check the rest of the elements to see if they are consecutive.
+ for (++i; i != e; ++i) {
+ int m = Mask[i];
+ if (m < 0)
+ continue;
+
+ Unary = Unary && (m < (int)e);
+ NeedsUnary = NeedsUnary || (m < s);
+
+ if (NeedsUnary && !Unary)
+ return false;
+ if (Unary && m != ((s+i) & (e-1)))
+ return false;
+ if (!Unary && m != (s+i))
+ return false;
+ }
+ return true;
+}
+
+bool X86::isPALIGNRMask(ShuffleVectorSDNode *N) {
+ SmallVector<int, 8> M;
+ N->getMask(M);
+ return ::isPALIGNRMask(M, N->getValueType(0), true);
+}
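+// For example, for v8i16 the mask <1, 2, 3, 4, 5, 6, 7, 8> is accepted: it
+// selects eight consecutive elements of the concatenated operands starting at
+// element 1, which PALIGNR can do with the byte shift computed by
+// getShufflePALIGNRImmediate below.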
+
+/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to SHUFP*.
+static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
+ int NumElems = VT.getVectorNumElements();
+ if (NumElems != 2 && NumElems != 4)
+ return false;
+
+ int Half = NumElems / 2;
+ for (int i = 0; i < Half; ++i)
+ if (!isUndefOrInRange(Mask[i], 0, NumElems))
+ return false;
+ for (int i = Half; i < NumElems; ++i)
+ if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
+ return false;
+
+ return true;
+}
+
+bool X86::isSHUFPMask(ShuffleVectorSDNode *N) {
+ SmallVector<int, 8> M;
+ N->getMask(M);
+ return ::isSHUFPMask(M, N->getValueType(0));
+}
+
+/// isCommutedSHUFP - Returns true if the shuffle mask is exactly
+/// the reverse of what x86 shuffles want. x86 shuffles require the lower
+/// half elements to come from vector 1 (which would equal the destination) and
+/// the upper half to come from vector 2.
+static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
+ int NumElems = VT.getVectorNumElements();
+
+ if (NumElems != 2 && NumElems != 4)
+ return false;
+
+ int Half = NumElems / 2;
+ for (int i = 0; i < Half; ++i)
+ if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
+ return false;
+ for (int i = Half; i < NumElems; ++i)
+ if (!isUndefOrInRange(Mask[i], 0, NumElems))
+ return false;
+ return true;
+}
+
+static bool isCommutedSHUFP(ShuffleVectorSDNode *N) {
+ SmallVector<int, 8> M;
+ N->getMask(M);
+ return isCommutedSHUFPMask(M, N->getValueType(0));
+}
+
+/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
+bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) {
+ if (N->getValueType(0).getVectorNumElements() != 4)
+ return false;
+
+  // Expect mask elements <6, 7, 2, 3>, allowing undef in any position.
+ return isUndefOrEqual(N->getMaskElt(0), 6) &&
+ isUndefOrEqual(N->getMaskElt(1), 7) &&
+ isUndefOrEqual(N->getMaskElt(2), 2) &&
+ isUndefOrEqual(N->getMaskElt(3), 3);
+}
+
+/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
+/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
+/// <2, 3, 2, 3>
+bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) {
+ unsigned NumElems = N->getValueType(0).getVectorNumElements();
+
+ if (NumElems != 4)
+ return false;
+
+ return isUndefOrEqual(N->getMaskElt(0), 2) &&
+ isUndefOrEqual(N->getMaskElt(1), 3) &&
+ isUndefOrEqual(N->getMaskElt(2), 2) &&
+ isUndefOrEqual(N->getMaskElt(3), 3);
+}
+
+/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
+bool X86::isMOVLPMask(ShuffleVectorSDNode *N) {
+ unsigned NumElems = N->getValueType(0).getVectorNumElements();
+
+ if (NumElems != 2 && NumElems != 4)
+ return false;
+
+ for (unsigned i = 0; i < NumElems/2; ++i)
+ if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems))
+ return false;
+
+ for (unsigned i = NumElems/2; i < NumElems; ++i)
+ if (!isUndefOrEqual(N->getMaskElt(i), i))
+ return false;
+
+ return true;
+}
+
+/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVLHPS.
+bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) {
+ unsigned NumElems = N->getValueType(0).getVectorNumElements();
+
+ if (NumElems != 2 && NumElems != 4)
+ return false;
+
+ for (unsigned i = 0; i < NumElems/2; ++i)
+ if (!isUndefOrEqual(N->getMaskElt(i), i))
+ return false;
+
+ for (unsigned i = 0; i < NumElems/2; ++i)
+ if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems))
+ return false;
+
+ return true;
+}
+
+/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to UNPCKL.
+static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT,
+ bool V2IsSplat = false) {
+ int NumElts = VT.getVectorNumElements();
+ if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
+ return false;
+
+ for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
+ int BitI = Mask[i];
+ int BitI1 = Mask[i+1];
+ if (!isUndefOrEqual(BitI, j))
+ return false;
+ if (V2IsSplat) {
+ if (!isUndefOrEqual(BitI1, NumElts))
+ return false;
+ } else {
+ if (!isUndefOrEqual(BitI1, j + NumElts))
+ return false;
+ }
+ }
+ return true;
+}
+
+bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
+ SmallVector<int, 8> M;
+ N->getMask(M);
+ return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat);
+}
+
+/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to UNPCKH.
+static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT,
+ bool V2IsSplat = false) {
+ int NumElts = VT.getVectorNumElements();
+ if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
+ return false;
+
+ for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
+ int BitI = Mask[i];
+ int BitI1 = Mask[i+1];
+ if (!isUndefOrEqual(BitI, j + NumElts/2))
+ return false;
+ if (V2IsSplat) {
+ if (isUndefOrEqual(BitI1, NumElts))
+ return false;
+ } else {
+ if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts))
+ return false;
+ }
+ }
+ return true;
+}
+
+bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
+ SmallVector<int, 8> M;
+ N->getMask(M);
+ return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat);
+}
+
+/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
+/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
+/// <0, 0, 1, 1>
+static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
+ int NumElems = VT.getVectorNumElements();
+ if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
+ return false;
+
+ for (int i = 0, j = 0; i != NumElems; i += 2, ++j) {
+ int BitI = Mask[i];
+ int BitI1 = Mask[i+1];
+ if (!isUndefOrEqual(BitI, j))
+ return false;
+ if (!isUndefOrEqual(BitI1, j))
+ return false;
+ }
+ return true;
+}
+
+bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) {
+ SmallVector<int, 8> M;
+ N->getMask(M);
+ return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0));
+}
+
+/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
+/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
+/// <2, 2, 3, 3>
+static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
+ int NumElems = VT.getVectorNumElements();
+ if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
+ return false;
+
+ for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) {
+ int BitI = Mask[i];
+ int BitI1 = Mask[i+1];
+ if (!isUndefOrEqual(BitI, j))
+ return false;
+ if (!isUndefOrEqual(BitI1, j))
+ return false;
+ }
+ return true;
+}
+
+bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) {
+ SmallVector<int, 8> M;
+ N->getMask(M);
+ return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0));
+}
+
+/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVSS,
+/// MOVSD, and MOVD, i.e. setting the lowest element.
+static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) {
+ if (VT.getVectorElementType().getSizeInBits() < 32)
+ return false;
+
+ int NumElts = VT.getVectorNumElements();
+
+ if (!isUndefOrEqual(Mask[0], NumElts))
+ return false;
+
+ for (int i = 1; i < NumElts; ++i)
+ if (!isUndefOrEqual(Mask[i], i))
+ return false;
+
+ return true;
+}
+
+bool X86::isMOVLMask(ShuffleVectorSDNode *N) {
+ SmallVector<int, 8> M;
+ N->getMask(M);
+ return ::isMOVLMask(M, N->getValueType(0));
+}
+
+/// isCommutedMOVLMask - Returns true if the shuffle mask is the reverse of
+/// what x86 movss wants: the lowest element must be the lowest element of
+/// vector 2 and the other elements must come from vector 1 in order.
+static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT,
+ bool V2IsSplat = false, bool V2IsUndef = false) {
+ int NumOps = VT.getVectorNumElements();
+ if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
+ return false;
+
+ if (!isUndefOrEqual(Mask[0], 0))
+ return false;
+
+ for (int i = 1; i < NumOps; ++i)
+ if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
+ (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
+ (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
+ return false;
+
+ return true;
+}
+
+static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false,
+ bool V2IsUndef = false) {
+ SmallVector<int, 8> M;
+ N->getMask(M);
+ return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef);
+}
+
+/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
+bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N) {
+ if (N->getValueType(0).getVectorNumElements() != 4)
+ return false;
+
+ // Expect 1, 1, 3, 3
+ for (unsigned i = 0; i < 2; ++i) {
+ int Elt = N->getMaskElt(i);
+ if (Elt >= 0 && Elt != 1)
+ return false;
+ }
+
+ bool HasHi = false;
+ for (unsigned i = 2; i < 4; ++i) {
+ int Elt = N->getMaskElt(i);
+ if (Elt >= 0 && Elt != 3)
+ return false;
+ if (Elt == 3)
+ HasHi = true;
+ }
+ // Don't use movshdup if it can be done with a shufps.
+ // FIXME: verify that matching u, u, 3, 3 is what we want.
+ return HasHi;
+}
+
+/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
+bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N) {
+ if (N->getValueType(0).getVectorNumElements() != 4)
+ return false;
+
+ // Expect 0, 0, 2, 2
+ for (unsigned i = 0; i < 2; ++i)
+ if (N->getMaskElt(i) > 0)
+ return false;
+
+ bool HasHi = false;
+ for (unsigned i = 2; i < 4; ++i) {
+ int Elt = N->getMaskElt(i);
+ if (Elt >= 0 && Elt != 2)
+ return false;
+ if (Elt == 2)
+ HasHi = true;
+ }
+ // Don't use movsldup if it can be done with a shufps.
+ return HasHi;
+}
+
+/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVDDUP.
+bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) {
+ int e = N->getValueType(0).getVectorNumElements() / 2;
+
+ for (int i = 0; i < e; ++i)
+ if (!isUndefOrEqual(N->getMaskElt(i), i))
+ return false;
+ for (int i = 0; i < e; ++i)
+ if (!isUndefOrEqual(N->getMaskElt(e+i), i))
+ return false;
+ return true;
+}
+
+/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
+unsigned X86::getShuffleSHUFImmediate(SDNode *N) {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ int NumOperands = SVOp->getValueType(0).getVectorNumElements();
+
+ unsigned Shift = (NumOperands == 4) ? 2 : 1;
+ unsigned Mask = 0;
+ for (int i = 0; i < NumOperands; ++i) {
+ int Val = SVOp->getMaskElt(NumOperands-i-1);
+ if (Val < 0) Val = 0;
+ if (Val >= NumOperands) Val -= NumOperands;
+ Mask |= Val;
+ if (i != NumOperands - 1)
+ Mask <<= Shift;
+ }
+ return Mask;
+}
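+// For example, the v4 mask <3, 1, 0, 2> yields
+// (2 << 6) | (0 << 4) | (1 << 2) | 3 == 0x87, with mask element 0 in the
+// lowest two bits of the immediate, matching the PSHUFD/SHUFPS encoding.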
+
+/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
+unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ unsigned Mask = 0;
+  // 8 elements, but we only care about the last 4.
+ for (unsigned i = 7; i >= 4; --i) {
+ int Val = SVOp->getMaskElt(i);
+ if (Val >= 0)
+ Mask |= (Val - 4);
+ if (i != 4)
+ Mask <<= 2;
+ }
+ return Mask;
+}
+
+/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
+unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ unsigned Mask = 0;
+  // 8 elements, but we only care about the first 4.
+ for (int i = 3; i >= 0; --i) {
+ int Val = SVOp->getMaskElt(i);
+ if (Val >= 0)
+ Mask |= Val;
+ if (i != 0)
+ Mask <<= 2;
+ }
+ return Mask;
+}
+
+/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
+unsigned X86::getShufflePALIGNRImmediate(SDNode *N) {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ EVT VVT = N->getValueType(0);
+ unsigned EltSize = VVT.getVectorElementType().getSizeInBits() >> 3;
+ int Val = 0;
+
+ unsigned i, e;
+ for (i = 0, e = VVT.getVectorNumElements(); i != e; ++i) {
+ Val = SVOp->getMaskElt(i);
+ if (Val >= 0)
+ break;
+ }
+ return (Val - i) * EltSize;
+}
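+// Continuing the v8i16 example above with mask <1, 2, 3, 4, 5, 6, 7, 8>: the
+// first non-undef element is 1 at position 0 and each element is two bytes
+// wide, so the returned PALIGNR immediate is (1 - 0) * 2 == 2.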
+
+/// isZeroNode - Returns true if Elt is a constant zero or a floating point
+/// constant +0.0.
+bool X86::isZeroNode(SDValue Elt) {
+ return ((isa<ConstantSDNode>(Elt) &&
+ cast<ConstantSDNode>(Elt)->getZExtValue() == 0) ||
+ (isa<ConstantFPSDNode>(Elt) &&
+ cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero()));
+}
+
+/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in
+/// their permute mask.
+static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
+ SelectionDAG &DAG) {
+ EVT VT = SVOp->getValueType(0);
+ unsigned NumElems = VT.getVectorNumElements();
+ SmallVector<int, 8> MaskVec;
+
+ for (unsigned i = 0; i != NumElems; ++i) {
+ int idx = SVOp->getMaskElt(i);
+ if (idx < 0)
+ MaskVec.push_back(idx);
+ else if (idx < (int)NumElems)
+ MaskVec.push_back(idx + NumElems);
+ else
+ MaskVec.push_back(idx - NumElems);
+ }
+ return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1),
+ SVOp->getOperand(0), &MaskVec[0]);
+}
+
+/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
+/// the two vector operands have swapped position.
+static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) {
+ unsigned NumElems = VT.getVectorNumElements();
+ for (unsigned i = 0; i != NumElems; ++i) {
+ int idx = Mask[i];
+ if (idx < 0)
+ continue;
+ else if (idx < (int)NumElems)
+ Mask[i] = idx + NumElems;
+ else
+ Mask[i] = idx - NumElems;
+ }
+}
+
+/// ShouldXformToMOVHLPS - Return true if the node should be transformed to
+/// match movhlps. The lower half elements should come from upper half of
+/// V1 (and in order), and the upper half elements should come from the upper
+/// half of V2 (and in order).
+static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) {
+ if (Op->getValueType(0).getVectorNumElements() != 4)
+ return false;
+ for (unsigned i = 0, e = 2; i != e; ++i)
+ if (!isUndefOrEqual(Op->getMaskElt(i), i+2))
+ return false;
+ for (unsigned i = 2; i != 4; ++i)
+ if (!isUndefOrEqual(Op->getMaskElt(i), i+4))
+ return false;
+ return true;
+}
+
+/// isScalarLoadToVector - Returns true if the node is a scalar load that
+/// is promoted to a vector. It also returns the LoadSDNode by reference if
+/// required.
+static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
+ if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
+ return false;
+ N = N->getOperand(0).getNode();
+ if (!ISD::isNON_EXTLoad(N))
+ return false;
+ if (LD)
+ *LD = cast<LoadSDNode>(N);
+ return true;
+}
+
+/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
+/// match movlp{s|d}. The lower half elements should come from lower half of
+/// V1 (and in order), and the upper half elements should come from the upper
+/// half of V2 (and in order). And since V1 will become the source of the
+/// MOVLP, it must be either a vector load or a scalar load to vector.
+static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
+ ShuffleVectorSDNode *Op) {
+ if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
+ return false;
+  // If V2 is a vector load, don't do this transformation. We will try to use
+  // load folding with a shufps op instead.
+ if (ISD::isNON_EXTLoad(V2))
+ return false;
+
+ unsigned NumElems = Op->getValueType(0).getVectorNumElements();
+
+ if (NumElems != 2 && NumElems != 4)
+ return false;
+ for (unsigned i = 0, e = NumElems/2; i != e; ++i)
+ if (!isUndefOrEqual(Op->getMaskElt(i), i))
+ return false;
+ for (unsigned i = NumElems/2; i != NumElems; ++i)
+ if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems))
+ return false;
+ return true;
+}
+
+/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
+/// all the same.
+static bool isSplatVector(SDNode *N) {
+ if (N->getOpcode() != ISD::BUILD_VECTOR)
+ return false;
+
+ SDValue SplatValue = N->getOperand(0);
+ for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
+ if (N->getOperand(i) != SplatValue)
+ return false;
+ return true;
+}
+
+/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
+/// to a zero vector.
+/// FIXME: move to dag combiner / method on ShuffleVectorSDNode
+static bool isZeroShuffle(ShuffleVectorSDNode *N) {
+ SDValue V1 = N->getOperand(0);
+ SDValue V2 = N->getOperand(1);
+ unsigned NumElems = N->getValueType(0).getVectorNumElements();
+ for (unsigned i = 0; i != NumElems; ++i) {
+ int Idx = N->getMaskElt(i);
+ if (Idx >= (int)NumElems) {
+ unsigned Opc = V2.getOpcode();
+ if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
+ continue;
+ if (Opc != ISD::BUILD_VECTOR ||
+ !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
+ return false;
+ } else if (Idx >= 0) {
+ unsigned Opc = V1.getOpcode();
+ if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
+ continue;
+ if (Opc != ISD::BUILD_VECTOR ||
+ !X86::isZeroNode(V1.getOperand(Idx)))
+ return false;
+ }
+ }
+ return true;
+}
+
+/// getZeroVector - Returns a vector of specified type with all zero elements.
+///
+static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG,
+ DebugLoc dl) {
+ assert(VT.isVector() && "Expected a vector type");
+
+ // Always build zero vectors as <4 x i32> or <2 x i32> bitcasted to their dest
+ // type. This ensures they get CSE'd.
+ SDValue Vec;
+ if (VT.getSizeInBits() == 64) { // MMX
+ SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
+ } else if (HasSSE2) { // SSE2
+ SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
+ } else { // SSE1
+ SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
+ }
+ return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
+}
+
+/// getOnesVector - Returns a vector of specified type with all bits set.
+///
+static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
+ assert(VT.isVector() && "Expected a vector type");
+
+ // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest
+ // type. This ensures they get CSE'd.
+ SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
+ SDValue Vec;
+ if (VT.getSizeInBits() == 64) // MMX
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
+ else // SSE
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
+ return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
+}
+
+
+/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
+ /// that point to V2 point to its first element.
+static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
+ EVT VT = SVOp->getValueType(0);
+ unsigned NumElems = VT.getVectorNumElements();
+
+ bool Changed = false;
+ SmallVector<int, 8> MaskVec;
+ SVOp->getMask(MaskVec);
+
+ for (unsigned i = 0; i != NumElems; ++i) {
+ if (MaskVec[i] > (int)NumElems) {
+ MaskVec[i] = NumElems;
+ Changed = true;
+ }
+ }
+ if (Changed)
+ return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0),
+ SVOp->getOperand(1), &MaskVec[0]);
+ return SDValue(SVOp, 0);
+}
+
+ /// getMOVL - Returns a vector_shuffle node for a movs{s|d}, movd
+ /// operation of the specified width.
+static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
+ SDValue V2) {
+ unsigned NumElems = VT.getVectorNumElements();
+ SmallVector<int, 8> Mask;
+ Mask.push_back(NumElems);
+ for (unsigned i = 1; i != NumElems; ++i)
+ Mask.push_back(i);
+ return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
+}
+
+/// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
+static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
+ SDValue V2) {
+ unsigned NumElems = VT.getVectorNumElements();
+ SmallVector<int, 8> Mask;
+ for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
+ Mask.push_back(i);
+ Mask.push_back(i + NumElems);
+ }
+ return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
+}
+
+ /// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
+static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
+ SDValue V2) {
+ unsigned NumElems = VT.getVectorNumElements();
+ unsigned Half = NumElems/2;
+ SmallVector<int, 8> Mask;
+ for (unsigned i = 0; i != Half; ++i) {
+ Mask.push_back(i + Half);
+ Mask.push_back(i + NumElems + Half);
+ }
+ return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
+}
+
+/// PromoteSplat - Promote a splat of v4f32, v8i16 or v16i8 to v4i32.
+static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG,
+ bool HasSSE2) {
+ if (SV->getValueType(0).getVectorNumElements() <= 4)
+ return SDValue(SV, 0);
+
+ EVT PVT = MVT::v4f32;
+ EVT VT = SV->getValueType(0);
+ DebugLoc dl = SV->getDebugLoc();
+ SDValue V1 = SV->getOperand(0);
+ int NumElems = VT.getVectorNumElements();
+ int EltNo = SV->getSplatIndex();
+
+ // unpack elements to the correct location
+ while (NumElems > 4) {
+ if (EltNo < NumElems/2) {
+ V1 = getUnpackl(DAG, dl, VT, V1, V1);
+ } else {
+ V1 = getUnpackh(DAG, dl, VT, V1, V1);
+ EltNo -= NumElems/2;
+ }
+ NumElems >>= 1;
+ }
+
+ // Perform the splat.
+ int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
+ V1 = DAG.getNode(ISD::BIT_CONVERT, dl, PVT, V1);
+ V1 = DAG.getVectorShuffle(PVT, dl, V1, DAG.getUNDEF(PVT), &SplatMask[0]);
+ return DAG.getNode(ISD::BIT_CONVERT, dl, VT, V1);
+}
+
+ /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
+ /// vector against a zero or undef vector. This produces a shuffle where the low
+/// element of V2 is swizzled into the zero/undef vector, landing at element
+/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
+static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
+ bool isZero, bool HasSSE2,
+ SelectionDAG &DAG) {
+ EVT VT = V2.getValueType();
+ SDValue V1 = isZero
+ ? getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
+ unsigned NumElems = VT.getVectorNumElements();
+ SmallVector<int, 16> MaskVec;
+ for (unsigned i = 0; i != NumElems; ++i)
+ // If this is the insertion idx, put the low elt of V2 here.
+ MaskVec.push_back(i == Idx ? NumElems : i);
+ return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]);
+}
+
+/// getNumOfConsecutiveZeros - Return the number of elements in a result of
+/// a shuffle that is zero.
+static
+unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, int NumElems,
+ bool Low, SelectionDAG &DAG) {
+ unsigned NumZeros = 0;
+ for (int i = 0; i < NumElems; ++i) {
+ unsigned Index = Low ? i : NumElems-i-1;
+ int Idx = SVOp->getMaskElt(Index);
+ if (Idx < 0) {
+ ++NumZeros;
+ continue;
+ }
+ SDValue Elt = DAG.getShuffleScalarElt(SVOp, Index);
+ if (Elt.getNode() && X86::isZeroNode(Elt))
+ ++NumZeros;
+ else
+ break;
+ }
+ return NumZeros;
+}
+
+/// isVectorShift - Returns true if the shuffle can be implemented as a
+/// logical left or right shift of a vector.
+/// FIXME: split into pslldqi, psrldqi, palignr variants.
+static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
+ bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
+ unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
+
+ isLeft = true;
+ unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, true, DAG);
+ if (!NumZeros) {
+ isLeft = false;
+ NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, false, DAG);
+ if (!NumZeros)
+ return false;
+ }
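+ // The remaining (non-zero) mask entries must form a consecutive, in-order
+ // run drawn from exactly one of the two input vectors.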
+ bool SeenV1 = false;
+ bool SeenV2 = false;
+ for (unsigned i = NumZeros; i < NumElems; ++i) {
+ unsigned Val = isLeft ? (i - NumZeros) : i;
+ int Idx_ = SVOp->getMaskElt(isLeft ? i : (i - NumZeros));
+ if (Idx_ < 0)
+ continue;
+ unsigned Idx = (unsigned) Idx_;
+ if (Idx < NumElems)
+ SeenV1 = true;
+ else {
+ Idx -= NumElems;
+ SeenV2 = true;
+ }
+ if (Idx != Val)
+ return false;
+ }
+ if (SeenV1 && SeenV2)
+ return false;
+
+ ShVal = SeenV1 ? SVOp->getOperand(0) : SVOp->getOperand(1);
+ ShAmt = NumZeros;
+ return true;
+}
+
+
+/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
+///
+static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
+ unsigned NumNonZero, unsigned NumZero,
+ SelectionDAG &DAG,
+ const TargetLowering &TLI) {
+ if (NumNonZero > 8)
+ return SDValue();
+
+ DebugLoc dl = Op.getDebugLoc();
+ SDValue V(0, 0);
+ bool First = true;
+ for (unsigned i = 0; i < 16; ++i) {
+ bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
+ if (ThisIsNonZero && First) {
+ if (NumZero)
+ V = getZeroVector(MVT::v8i16, true, DAG, dl);
+ else
+ V = DAG.getUNDEF(MVT::v8i16);
+ First = false;
+ }
+
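+ // Bytes are combined in pairs into i16 elements: the odd-indexed byte is
+ // zero-extended, shifted into the high 8 bits, and OR'd with the
+ // even-indexed byte, then inserted into the v8i16 at position i/2.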
+ if ((i & 1) != 0) {
+ SDValue ThisElt(0, 0), LastElt(0, 0);
+ bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
+ if (LastIsNonZero) {
+ LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
+ MVT::i16, Op.getOperand(i-1));
+ }
+ if (ThisIsNonZero) {
+ ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
+ ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
+ ThisElt, DAG.getConstant(8, MVT::i8));
+ if (LastIsNonZero)
+ ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
+ } else
+ ThisElt = LastElt;
+
+ if (ThisElt.getNode())
+ V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
+ DAG.getIntPtrConstant(i/2));
+ }
+ }
+
+ return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V);
+}
+
+/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
+///
+static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
+ unsigned NumNonZero, unsigned NumZero,
+ SelectionDAG &DAG,
+ const TargetLowering &TLI) {
+ if (NumNonZero > 4)
+ return SDValue();
+
+ DebugLoc dl = Op.getDebugLoc();
+ SDValue V(0, 0);
+ bool First = true;
+ for (unsigned i = 0; i < 8; ++i) {
+ bool isNonZero = (NonZeros & (1 << i)) != 0;
+ if (isNonZero) {
+ if (First) {
+ if (NumZero)
+ V = getZeroVector(MVT::v8i16, true, DAG, dl);
+ else
+ V = DAG.getUNDEF(MVT::v8i16);
+ First = false;
+ }
+ V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
+ MVT::v8i16, V, Op.getOperand(i),
+ DAG.getIntPtrConstant(i));
+ }
+ }
+
+ return V;
+}
+
+/// getVShift - Return a vector logical shift node.
+///
+static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
+ unsigned NumBits, SelectionDAG &DAG,
+ const TargetLowering &TLI, DebugLoc dl) {
+ bool isMMX = VT.getSizeInBits() == 64;
+ EVT ShVT = isMMX ? MVT::v1i64 : MVT::v2i64;
+ unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL;
+ SrcOp = DAG.getNode(ISD::BIT_CONVERT, dl, ShVT, SrcOp);
+ return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
+ DAG.getNode(Opc, dl, ShVT, SrcOp,
+ DAG.getConstant(NumBits, TLI.getShiftAmountTy())));
+}
+
+SDValue
+X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
+ SelectionDAG &DAG) const {
+
+ // Check if the scalar load can be widened into a vector load. And if
+ // the address is "base + cst" see if the cst can be "absorbed" into
+ // the shuffle mask.
+ if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
+ SDValue Ptr = LD->getBasePtr();
+ if (!ISD::isNormalLoad(LD) || LD->isVolatile())
+ return SDValue();
+ EVT PVT = LD->getValueType(0);
+ if (PVT != MVT::i32 && PVT != MVT::f32)
+ return SDValue();
+
+ int FI = -1;
+ int64_t Offset = 0;
+ if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
+ FI = FINode->getIndex();
+ Offset = 0;
+ } else if (Ptr.getOpcode() == ISD::ADD &&
+ isa<ConstantSDNode>(Ptr.getOperand(1)) &&
+ isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
+ FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
+ Offset = Ptr.getConstantOperandVal(1);
+ Ptr = Ptr.getOperand(0);
+ } else {
+ return SDValue();
+ }
+
+ SDValue Chain = LD->getChain();
+ // Make sure the stack object alignment is at least 16.
+ MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+ if (DAG.InferPtrAlignment(Ptr) < 16) {
+ if (MFI->isFixedObjectIndex(FI)) {
+ // Can't change the alignment. FIXME: It's possible to compute
+ // the exact stack offset and reference FI + adjust offset instead.
+ // If someone *really* cares about this, that's the way to implement it.
+ return SDValue();
+ } else {
+ MFI->setObjectAlignment(FI, 16);
+ }
+ }
+
+ // (Offset % 16) must be a multiple of 4. The address is then
+ // Ptr + (Offset & ~15).
+ if (Offset < 0)
+ return SDValue();
+ if ((Offset % 16) & 3)
+ return SDValue();
+ int64_t StartOffset = Offset & ~15;
+ if (StartOffset)
+ Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(),
+ Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
+
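+ // The desired element sits (Offset - StartOffset) / 4 elements into the
+ // widened 128-bit load; splat it to all four lanes with the shuffle below.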
+ int EltNo = (Offset - StartOffset) >> 2;
+ int Mask[4] = { EltNo, EltNo, EltNo, EltNo };
+ EVT VT = (PVT == MVT::i32) ? MVT::v4i32 : MVT::v4f32;
+ SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr,LD->getSrcValue(),0,
+ false, false, 0);
+ // Canonicalize it to a v4i32 shuffle.
+ V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, V1);
+ return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
+ DAG.getVectorShuffle(MVT::v4i32, dl, V1,
+ DAG.getUNDEF(MVT::v4i32), &Mask[0]));
+ }
+
+ return SDValue();
+}
+
+/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
+/// vector of type 'VT', see if the elements can be replaced by a single large
+/// load which has the same value as a build_vector whose operands are 'elts'.
+///
+/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
+///
+/// FIXME: we'd also like to handle the case where the last elements are zero
+/// rather than undef via VZEXT_LOAD, but we do not detect that case today.
+/// There's even a handy isZeroNode for that purpose.
+static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
+ DebugLoc &dl, SelectionDAG &DAG) {
+ EVT EltVT = VT.getVectorElementType();
+ unsigned NumElems = Elts.size();
+
+ LoadSDNode *LDBase = NULL;
+ unsigned LastLoadedElt = -1U;
+
+ // For each element in the initializer, see if we've found a load or an undef.
+ // If we don't find an initial load element, or later load elements are
+ // non-consecutive, bail out.
+ for (unsigned i = 0; i < NumElems; ++i) {
+ SDValue Elt = Elts[i];
+
+ if (!Elt.getNode() ||
+ (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
+ return SDValue();
+ if (!LDBase) {
+ if (Elt.getNode()->getOpcode() == ISD::UNDEF)
+ return SDValue();
+ LDBase = cast<LoadSDNode>(Elt.getNode());
+ LastLoadedElt = i;
+ continue;
+ }
+ if (Elt.getOpcode() == ISD::UNDEF)
+ continue;
+
+ LoadSDNode *LD = cast<LoadSDNode>(Elt);
+ if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
+ return SDValue();
+ LastLoadedElt = i;
+ }
+
+ // If we have found an entire vector of loads and undefs, then return a large
+ // load of the entire vector width starting at the base pointer. If we found
+ // consecutive loads for the low half, generate a vzext_load node.
+ if (LastLoadedElt == NumElems - 1) {
+ if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
+ return DAG.getLoad(VT, dl, LDBase->getChain(), LDBase->getBasePtr(),
+ LDBase->getSrcValue(), LDBase->getSrcValueOffset(),
+ LDBase->isVolatile(), LDBase->isNonTemporal(), 0);
+ return DAG.getLoad(VT, dl, LDBase->getChain(), LDBase->getBasePtr(),
+ LDBase->getSrcValue(), LDBase->getSrcValueOffset(),
+ LDBase->isVolatile(), LDBase->isNonTemporal(),
+ LDBase->getAlignment());
+ } else if (NumElems == 4 && LastLoadedElt == 1) {
+ SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
+ SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
+ SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2);
+ return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode);
+ }
+ return SDValue();
+}
+
+SDValue
+X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
+ DebugLoc dl = Op.getDebugLoc();
+ // All zeros are handled with pxor, all ones are handled with pcmpeqd.
+ if (ISD::isBuildVectorAllZeros(Op.getNode())
+ || ISD::isBuildVectorAllOnes(Op.getNode())) {
+ // Canonicalize this to either <4 x i32> or <2 x i32> (SSE vs MMX) to
+ // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are
+ // eliminated on x86-32 hosts.
+ if (Op.getValueType() == MVT::v4i32 || Op.getValueType() == MVT::v2i32)
+ return Op;
+
+ if (ISD::isBuildVectorAllOnes(Op.getNode()))
+ return getOnesVector(Op.getValueType(), DAG, dl);
+ return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl);
+ }
+
+ EVT VT = Op.getValueType();
+ EVT ExtVT = VT.getVectorElementType();
+ unsigned EVTBits = ExtVT.getSizeInBits();
+
+ unsigned NumElems = Op.getNumOperands();
+ unsigned NumZero = 0;
+ unsigned NumNonZero = 0;
+ unsigned NonZeros = 0;
+ bool IsAllConstants = true;
+ SmallSet<SDValue, 8> Values;
+ for (unsigned i = 0; i < NumElems; ++i) {
+ SDValue Elt = Op.getOperand(i);
+ if (Elt.getOpcode() == ISD::UNDEF)
+ continue;
+ Values.insert(Elt);
+ if (Elt.getOpcode() != ISD::Constant &&
+ Elt.getOpcode() != ISD::ConstantFP)
+ IsAllConstants = false;
+ if (X86::isZeroNode(Elt))
+ NumZero++;
+ else {
+ NonZeros |= (1 << i);
+ NumNonZero++;
+ }
+ }
+
+ if (NumNonZero == 0) {
+ // All-undef vector. Return an UNDEF. All-zero vectors were handled above.
+ return DAG.getUNDEF(VT);
+ }
+
+ // Special case for single non-zero, non-undef, element.
+ if (NumNonZero == 1) {
+ unsigned Idx = CountTrailingZeros_32(NonZeros);
+ SDValue Item = Op.getOperand(Idx);
+
+ // If this is an insertion of an i64 value on x86-32, and if the top bits of
+ // the value are obviously zero, truncate the value to i32 and do the
+ // insertion that way. Only do this if the value is non-constant or if the
+ // value is a constant being inserted into element 0. It is cheaper to do
+ // a constant pool load than it is to do a movd + shuffle.
+ if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
+ (!IsAllConstants || Idx == 0)) {
+ if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
+ // Handle both MMX and SSE.
+ EVT VecVT = VT == MVT::v2i64 ? MVT::v4i32 : MVT::v2i32;
+ unsigned VecElts = VT == MVT::v2i64 ? 4 : 2;
+
+ // Truncate the value (which may itself be a constant) to i32, and
+ // convert it to a vector with movd (S2V+shuffle to zero extend).
+ Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
+ Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
+ Item = getShuffleVectorZeroOrUndef(Item, 0, true,
+ Subtarget->hasSSE2(), DAG);
+
+ // Now we have our 32-bit value zero extended in the low element of
+ // a vector. If Idx != 0, swizzle it into place.
+ if (Idx != 0) {
+ SmallVector<int, 4> Mask;
+ Mask.push_back(Idx);
+ for (unsigned i = 1; i != VecElts; ++i)
+ Mask.push_back(i);
+ Item = DAG.getVectorShuffle(VecVT, dl, Item,
+ DAG.getUNDEF(Item.getValueType()),
+ &Mask[0]);
+ }
+ return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Item);
+ }
+ }
+
+ // If we have a constant or non-constant insertion into the low element of
+ // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
+ // the rest of the elements. This will be matched as movd/movq/movss/movsd
+ // depending on what the source datatype is.
+ if (Idx == 0) {
+ if (NumZero == 0) {
+ return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
+ } else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
+ (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
+ Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
+ // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
+ return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(),
+ DAG);
+ } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
+ Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
+ EVT MiddleVT = VT.getSizeInBits() == 64 ? MVT::v2i32 : MVT::v4i32;
+ Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item);
+ Item = getShuffleVectorZeroOrUndef(Item, 0, true,
+ Subtarget->hasSSE2(), DAG);
+ return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Item);
+ }
+ }
+
+ // Is it a vector logical left shift?
+ if (NumElems == 2 && Idx == 1 &&
+ X86::isZeroNode(Op.getOperand(0)) &&
+ !X86::isZeroNode(Op.getOperand(1))) {
+ unsigned NumBits = VT.getSizeInBits();
+ return getVShift(true, VT,
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+ VT, Op.getOperand(1)),
+ NumBits/2, DAG, *this, dl);
+ }
+
+ if (IsAllConstants) // Otherwise, it's better to do a constpool load.
+ return SDValue();
+
+ // Otherwise, if this is a vector with i32 or f32 elements, and the element
+ // is a non-constant being inserted into an element other than the low one,
+ // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
+ // movd/movss) to move this into the low element, then shuffle it into
+ // place.
+ if (EVTBits == 32) {
+ Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
+
+ // Turn it into a shuffle of zero and zero-extended scalar to vector.
+ Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0,
+ Subtarget->hasSSE2(), DAG);
+ SmallVector<int, 8> MaskVec;
+ for (unsigned i = 0; i < NumElems; i++)
+ MaskVec.push_back(i == Idx ? 0 : 1);
+ return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
+ }
+ }
+
+ // Splat is obviously ok. Let legalizer expand it to a shuffle.
+ if (Values.size() == 1) {
+ if (EVTBits == 32) {
+ // Instead of a shuffle like this:
+ // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
+ // Check if it's possible to issue this instead.
+ // shuffle (vload ptr), undef, <1, 1, 1, 1>
+ unsigned Idx = CountTrailingZeros_32(NonZeros);
+ SDValue Item = Op.getOperand(Idx);
+ if (Op.getNode()->isOnlyUserOf(Item.getNode()))
+ return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
+ }
+ return SDValue();
+ }
+
+ // A vector full of immediates; various special cases are already
+ // handled, so this is best done with a single constant-pool load.
+ if (IsAllConstants)
+ return SDValue();
+
+ // Let legalizer expand 2-wide build_vectors.
+ if (EVTBits == 64) {
+ if (NumNonZero == 1) {
+ // One half is zero or undef.
+ unsigned Idx = CountTrailingZeros_32(NonZeros);
+ SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
+ Op.getOperand(Idx));
+ return getShuffleVectorZeroOrUndef(V2, Idx, true,
+ Subtarget->hasSSE2(), DAG);
+ }
+ return SDValue();
+ }
+
+ // If element VT is < 32 bits, convert it to inserts into a zero vector.
+ if (EVTBits == 8 && NumElems == 16) {
+ SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
+ *this);
+ if (V.getNode()) return V;
+ }
+
+ if (EVTBits == 16 && NumElems == 8) {
+ SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
+ *this);
+ if (V.getNode()) return V;
+ }
+
+ // If element VT is 32 bits, turn it into a number of shuffles.
+ SmallVector<SDValue, 8> V;
+ V.resize(NumElems);
+ if (NumElems == 4 && NumZero > 0) {
+ for (unsigned i = 0; i < 4; ++i) {
+ bool isZero = !(NonZeros & (1 << i));
+ if (isZero)
+ V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
+ else
+ V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
+ }
+
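+ // Combine the scalars pairwise. For each pair, either keep the zero vector,
+ // MOVL the single non-zero element into lane 0 over the zero vector, or
+ // unpckl the two non-zero elements, depending on which lanes are non-zero.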
+ for (unsigned i = 0; i < 2; ++i) {
+ switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
+ default: break;
+ case 0:
+ V[i] = V[i*2]; // Must be a zero vector.
+ break;
+ case 1:
+ V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
+ break;
+ case 2:
+ V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
+ break;
+ case 3:
+ V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
+ break;
+ }
+ }
+
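+ // Build the final mask. Where only the high element of a pair was non-zero
+ // (case 2 above), the MOVL left it in lane 0, so that pair is emitted in
+ // reverse order.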
+ SmallVector<int, 8> MaskVec;
+ bool Reverse = (NonZeros & 0x3) == 2;
+ for (unsigned i = 0; i < 2; ++i)
+ MaskVec.push_back(Reverse ? 1-i : i);
+ Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2;
+ for (unsigned i = 0; i < 2; ++i)
+ MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems);
+ return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
+ }
+
+ if (Values.size() > 1 && VT.getSizeInBits() == 128) {
+ // Check for a build vector of consecutive loads.
+ for (unsigned i = 0; i < NumElems; ++i)
+ V[i] = Op.getOperand(i);
+
+ // Check for elements which are consecutive loads.
+ SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG);
+ if (LD.getNode())
+ return LD;
+
+ // For SSE 4.1, use inserts into undef.
+ if (getSubtarget()->hasSSE41()) {
+ V[0] = DAG.getUNDEF(VT);
+ for (unsigned i = 0; i < NumElems; ++i)
+ if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
+ V[0] = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V[0],
+ Op.getOperand(i), DAG.getIntPtrConstant(i));
+ return V[0];
+ }
+
+ // Otherwise, expand into a number of unpckl*
+ // e.g. for v4f32
+ // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
+ // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
+ // Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
+ for (unsigned i = 0; i < NumElems; ++i)
+ V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
+ NumElems >>= 1;
+ while (NumElems != 0) {
+ for (unsigned i = 0; i < NumElems; ++i)
+ V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + NumElems]);
+ NumElems >>= 1;
+ }
+ return V[0];
+ }
+ return SDValue();
+}
+
+SDValue
+X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
+ // We support concatenating two MMX registers and placing them in an MMX
+ // register. This is better than converting through the stack.
+ DebugLoc dl = Op.getDebugLoc();
+ EVT ResVT = Op.getValueType();
+ assert(Op.getNumOperands() == 2);
+ assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 ||
+ ResVT == MVT::v8i16 || ResVT == MVT::v16i8);
+ int Mask[2];
+ SDValue InVec = DAG.getNode(ISD::BIT_CONVERT,dl, MVT::v1i64, Op.getOperand(0));
+ SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
+ InVec = Op.getOperand(1);
+ if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+ unsigned NumElts = ResVT.getVectorNumElements();
+ VecOp = DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp);
+ VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp,
+ InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1));
+ } else {
+ InVec = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v1i64, InVec);
+ SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
+ Mask[0] = 0; Mask[1] = 2;
+ VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask);
+ }
+ return DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp);
+}
+
+// v8i16 shuffles - Prefer shuffles in the following order:
+// 1. [all] pshuflw, pshufhw, optional move
+// 2. [ssse3] 1 x pshufb
+// 3. [ssse3] 2 x pshufb + 1 x por
+// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
+static
+SDValue LowerVECTOR_SHUFFLEv8i16(ShuffleVectorSDNode *SVOp,
+ SelectionDAG &DAG,
+ const X86TargetLowering &TLI) {
+ SDValue V1 = SVOp->getOperand(0);
+ SDValue V2 = SVOp->getOperand(1);
+ DebugLoc dl = SVOp->getDebugLoc();
+ SmallVector<int, 8> MaskVals;
+
+ // Determine if more than 1 of the words in each of the low and high quadwords
+ // of the result come from the same quadword of one of the two inputs. Undef
+ // mask values count as coming from any quadword, for better codegen.
+ SmallVector<unsigned, 4> LoQuad(4);
+ SmallVector<unsigned, 4> HiQuad(4);
+ BitVector InputQuads(4);
+ for (unsigned i = 0; i < 8; ++i) {
+ SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad;
+ int EltIdx = SVOp->getMaskElt(i);
+ MaskVals.push_back(EltIdx);
+ if (EltIdx < 0) {
+ ++Quad[0];
+ ++Quad[1];
+ ++Quad[2];
+ ++Quad[3];
+ continue;
+ }
+ ++Quad[EltIdx / 4];
+ InputQuads.set(EltIdx / 4);
+ }
+
+ int BestLoQuad = -1;
+ unsigned MaxQuad = 1;
+ for (unsigned i = 0; i < 4; ++i) {
+ if (LoQuad[i] > MaxQuad) {
+ BestLoQuad = i;
+ MaxQuad = LoQuad[i];
+ }
+ }
+
+ int BestHiQuad = -1;
+ MaxQuad = 1;
+ for (unsigned i = 0; i < 4; ++i) {
+ if (HiQuad[i] > MaxQuad) {
+ BestHiQuad = i;
+ MaxQuad = HiQuad[i];
+ }
+ }
+
+ // For SSSE3, if all 8 words of the result come from only 1 quadword of each
+ // of the two input vectors, shuffle them into one input vector so only a
+ // single pshufb instruction is necessary. If there are more than 2 input
+ // quads, disable the next transformation since it does not help SSSE3.
+ bool V1Used = InputQuads[0] || InputQuads[1];
+ bool V2Used = InputQuads[2] || InputQuads[3];
+ if (TLI.getSubtarget()->hasSSSE3()) {
+ if (InputQuads.count() == 2 && V1Used && V2Used) {
+ BestLoQuad = InputQuads.find_first();
+ BestHiQuad = InputQuads.find_next(BestLoQuad);
+ }
+ if (InputQuads.count() > 2) {
+ BestLoQuad = -1;
+ BestHiQuad = -1;
+ }
+ }
+
+ // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
+ // the shuffle mask. If a quad is scored as -1, that means that it contains
+ // words from all 4 input quadwords.
+ SDValue NewV;
+ if (BestLoQuad >= 0 || BestHiQuad >= 0) {
+ SmallVector<int, 8> MaskV;
+ MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad);
+ MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad);
+ NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
+ DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V1),
+ DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V2), &MaskV[0]);
+ NewV = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, NewV);
+
+ // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
+ // source words for the shuffle, to aid later transformations.
+ bool AllWordsInNewV = true;
+ bool InOrder[2] = { true, true };
+ for (unsigned i = 0; i != 8; ++i) {
+ int idx = MaskVals[i];
+ if (idx != (int)i)
+ InOrder[i/4] = false;
+ if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
+ continue;
+ AllWordsInNewV = false;
+ break;
+ }
+
+ bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
+ if (AllWordsInNewV) {
+ for (int i = 0; i != 8; ++i) {
+ int idx = MaskVals[i];
+ if (idx < 0)
+ continue;
+ idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
+ if ((idx != i) && idx < 4)
+ pshufhw = false;
+ if ((idx != i) && idx > 3)
+ pshuflw = false;
+ }
+ V1 = NewV;
+ V2Used = false;
+ BestLoQuad = 0;
+ BestHiQuad = 1;
+ }
+
+ // If we've eliminated the use of V2, and the new mask is a pshuflw or
+ // pshufhw, that's as cheap as it gets. Return the new shuffle.
+ if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
+ return DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
+ DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
+ }
+ }
+
+ // If we have SSSE3, and all words of the result are from 1 input vector,
+ // case 2 is generated, otherwise case 3 is generated. If no SSSE3
+ // is present, fall back to case 4.
+ if (TLI.getSubtarget()->hasSSSE3()) {
+ SmallVector<SDValue,16> pshufbMask;
+
+ // If we have elements from both input vectors, set the high bit of the
+ // shuffle mask element to zero out elements that come from V2 in the V1
+ // mask, and elements that come from V1 in the V2 mask, so that the two
+ // results can be OR'd together.
+ bool TwoInputs = V1Used && V2Used;
+ for (unsigned i = 0; i != 8; ++i) {
+ int EltIdx = MaskVals[i] * 2;
+ if (TwoInputs && (EltIdx >= 16)) {
+ pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
+ continue;
+ }
+ pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8));
+ }
+ V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V1);
+ V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
+ DAG.getNode(ISD::BUILD_VECTOR, dl,
+ MVT::v16i8, &pshufbMask[0], 16));
+ if (!TwoInputs)
+ return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
+
+ // Calculate the shuffle mask for the second input, shuffle it, and
+ // OR it with the first shuffled input.
+ pshufbMask.clear();
+ for (unsigned i = 0; i != 8; ++i) {
+ int EltIdx = MaskVals[i] * 2;
+ if (EltIdx < 16) {
+ pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
+ continue;
+ }
+ pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8));
+ }
+ V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V2);
+ V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
+ DAG.getNode(ISD::BUILD_VECTOR, dl,
+ MVT::v16i8, &pshufbMask[0], 16));
+ V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
+ return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
+ }
+
+ // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
+ // and update MaskVals with new element order.
+ BitVector InOrder(8);
+ if (BestLoQuad >= 0) {
+ SmallVector<int, 8> MaskV;
+ for (int i = 0; i != 4; ++i) {
+ int idx = MaskVals[i];
+ if (idx < 0) {
+ MaskV.push_back(-1);
+ InOrder.set(i);
+ } else if ((idx / 4) == BestLoQuad) {
+ MaskV.push_back(idx & 3);
+ InOrder.set(i);
+ } else {
+ MaskV.push_back(-1);
+ }
+ }
+ for (unsigned i = 4; i != 8; ++i)
+ MaskV.push_back(i);
+ NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
+ &MaskV[0]);
+ }
+
+ // If BestHiQuad >= 0, generate a pshufhw to put the high elements in order,
+ // and update MaskVals with the new element order.
+ if (BestHiQuad >= 0) {
+ SmallVector<int, 8> MaskV;
+ for (unsigned i = 0; i != 4; ++i)
+ MaskV.push_back(i);
+ for (unsigned i = 4; i != 8; ++i) {
+ int idx = MaskVals[i];
+ if (idx < 0) {
+ MaskV.push_back(-1);
+ InOrder.set(i);
+ } else if ((idx / 4) == BestHiQuad) {
+ MaskV.push_back((idx & 3) + 4);
+ InOrder.set(i);
+ } else {
+ MaskV.push_back(-1);
+ }
+ }
+ NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
+ &MaskV[0]);
+ }
+
+ // In case BestHiQuad & BestLoQuad were both -1, which means each quadword has a word
+ // from each of the four input quadwords, calculate the InOrder bitvector now
+ // before falling through to the insert/extract cleanup.
+ if (BestLoQuad == -1 && BestHiQuad == -1) {
+ NewV = V1;
+ for (int i = 0; i != 8; ++i)
+ if (MaskVals[i] < 0 || MaskVals[i] == i)
+ InOrder.set(i);
+ }
+
+ // The other elements are put in the right place using pextrw and pinsrw.
+ for (unsigned i = 0; i != 8; ++i) {
+ if (InOrder[i])
+ continue;
+ int EltIdx = MaskVals[i];
+ if (EltIdx < 0)
+ continue;
+ SDValue ExtOp = (EltIdx < 8)
+ ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
+ DAG.getIntPtrConstant(EltIdx))
+ : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
+ DAG.getIntPtrConstant(EltIdx - 8));
+ NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
+ DAG.getIntPtrConstant(i));
+ }
+ return NewV;
+}
+
+// v16i8 shuffles - Prefer shuffles in the following order:
+// 1. [ssse3] 1 x pshufb
+// 2. [ssse3] 2 x pshufb + 1 x por
+// 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw
+static
+SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
+ SelectionDAG &DAG,
+ const X86TargetLowering &TLI) {
+ SDValue V1 = SVOp->getOperand(0);
+ SDValue V2 = SVOp->getOperand(1);
+ DebugLoc dl = SVOp->getDebugLoc();
+ SmallVector<int, 16> MaskVals;
+ SVOp->getMask(MaskVals);
+
+ // If we have SSSE3, case 1 is generated when all result bytes come from
+ // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is
+ // present, fall back to case 3.
+ // FIXME: kill V2Only once shuffles are canonicalized by getNode.
+ bool V1Only = true;
+ bool V2Only = true;
+ for (unsigned i = 0; i < 16; ++i) {
+ int EltIdx = MaskVals[i];
+ if (EltIdx < 0)
+ continue;
+ if (EltIdx < 16)
+ V2Only = false;
+ else
+ V1Only = false;
+ }
+
+ // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
+ if (TLI.getSubtarget()->hasSSSE3()) {
+ SmallVector<SDValue,16> pshufbMask;
+
+ // If all result elements are from one input vector, then only translate
+ // undef mask values to 0x80 (zero out result) in the pshufb mask.
+ //
+ // Otherwise, we have elements from both input vectors, and must zero out
+ // elements that come from V2 in the first mask, and V1 in the second mask
+ // so that we can OR them together.
+ bool TwoInputs = !(V1Only || V2Only);
+ for (unsigned i = 0; i != 16; ++i) {
+ int EltIdx = MaskVals[i];
+ if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) {
+ pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
+ continue;
+ }
+ pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
+ }
+ // If all the elements are from V2, assign it to V1 and return after
+ // building the first pshufb.
+ if (V2Only)
+ V1 = V2;
+ V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
+ DAG.getNode(ISD::BUILD_VECTOR, dl,
+ MVT::v16i8, &pshufbMask[0], 16));
+ if (!TwoInputs)
+ return V1;
+
+ // Calculate the shuffle mask for the second input, shuffle it, and
+ // OR it with the first shuffled input.
+ pshufbMask.clear();
+ for (unsigned i = 0; i != 16; ++i) {
+ int EltIdx = MaskVals[i];
+ if (EltIdx < 16) {
+ pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
+ continue;
+ }
+ pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
+ }
+ V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
+ DAG.getNode(ISD::BUILD_VECTOR, dl,
+ MVT::v16i8, &pshufbMask[0], 16));
+ return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
+ }
+
+ // No SSSE3 - Calculate the in-place words and then fix all out-of-place words
+ // with 0-16 extracts & inserts. The worst case is 16 bytes out of order from
+ // the 16 different words that comprise the two doublequadword input vectors.
+ V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
+ V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V2);
+ SDValue NewV = V2Only ? V2 : V1;
+ for (int i = 0; i != 8; ++i) {
+ int Elt0 = MaskVals[i*2];
+ int Elt1 = MaskVals[i*2+1];
+
+ // This word of the result is all undef, skip it.
+ if (Elt0 < 0 && Elt1 < 0)
+ continue;
+
+ // This word of the result is already in the correct place, skip it.
+ if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1))
+ continue;
+ if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17))
+ continue;
+
+ SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
+ SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
+ SDValue InsElt;
+
+ // If Elt0 and Elt1 are defined, are consecutive, and can be loaded
+ // together using a single extract, extract the word once and insert it.
+ if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
+ InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
+ DAG.getIntPtrConstant(Elt1 / 2));
+ NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
+ DAG.getIntPtrConstant(i));
+ continue;
+ }
+
+ // If Elt1 is defined, extract it from the appropriate source. If the
+ // source byte is not also odd, shift the extracted word left 8 bits;
+ // otherwise, clear the bottom 8 bits if we need to do an OR.
+ if (Elt1 >= 0) {
+ InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
+ DAG.getIntPtrConstant(Elt1 / 2));
+ if ((Elt1 & 1) == 0)
+ InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
+ DAG.getConstant(8, TLI.getShiftAmountTy()));
+ else if (Elt0 >= 0)
+ InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
+ DAG.getConstant(0xFF00, MVT::i16));
+ }
+ // If Elt0 is defined, extract it from the appropriate source. If the
+ // source byte is not also even, shift the extracted word right 8 bits. If
+ // Elt1 was also defined, OR the extracted values together before
+ // inserting them in the result.
+ if (Elt0 >= 0) {
+ SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
+ Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
+ if ((Elt0 & 1) != 0)
+ InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
+ DAG.getConstant(8, TLI.getShiftAmountTy()));
+ else if (Elt1 >= 0)
+ InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
+ DAG.getConstant(0x00FF, MVT::i16));
+ InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
+ : InsElt0;
+ }
+ NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
+ DAG.getIntPtrConstant(i));
+ }
+ return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, NewV);
+}
+
+ /// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
+ /// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
+ /// done when every pair / quad of shuffle mask elements points to elements in
+/// the right sequence. e.g.
+/// vector_shuffle <>, <>, < 3, 4, | 10, 11, | 0, 1, | 14, 15>
+static
+SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
+ SelectionDAG &DAG,
+ const TargetLowering &TLI, DebugLoc dl) {
+ EVT VT = SVOp->getValueType(0);
+ SDValue V1 = SVOp->getOperand(0);
+ SDValue V2 = SVOp->getOperand(1);
+ unsigned NumElems = VT.getVectorNumElements();
+ unsigned NewWidth = (NumElems == 4) ? 2 : 4;
+ EVT MaskVT = MVT::getIntVectorWithNumElements(NewWidth);
+ EVT MaskEltVT = MaskVT.getVectorElementType();
+ EVT NewVT = MaskVT;
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: assert(false && "Unexpected!");
+ case MVT::v4f32: NewVT = MVT::v2f64; break;
+ case MVT::v4i32: NewVT = MVT::v2i64; break;
+ case MVT::v8i16: NewVT = MVT::v4i32; break;
+ case MVT::v16i8: NewVT = MVT::v4i32; break;
+ }
+
+ if (NewWidth == 2) {
+ if (VT.isInteger())
+ NewVT = MVT::v2i64;
+ else
+ NewVT = MVT::v2f64;
+ }
+ int Scale = NumElems / NewWidth;
+ SmallVector<int, 8> MaskVec;
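+ // Each group of Scale consecutive mask elements must reference Scale
+ // consecutive source elements starting at a multiple of Scale (undefs are
+ // allowed); the whole group then collapses to one wide-element index.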
+ for (unsigned i = 0; i < NumElems; i += Scale) {
+ int StartIdx = -1;
+ for (int j = 0; j < Scale; ++j) {
+ int EltIdx = SVOp->getMaskElt(i+j);
+ if (EltIdx < 0)
+ continue;
+ if (StartIdx == -1)
+ StartIdx = EltIdx - (EltIdx % Scale);
+ if (EltIdx != StartIdx + j)
+ return SDValue();
+ }
+ if (StartIdx == -1)
+ MaskVec.push_back(-1);
+ else
+ MaskVec.push_back(StartIdx / Scale);
+ }
+
+ V1 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V1);
+ V2 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V2);
+ return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
+}
+
+/// getVZextMovL - Return a zero-extending vector move low node.
+///
+static SDValue getVZextMovL(EVT VT, EVT OpVT,
+ SDValue SrcOp, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget, DebugLoc dl) {
+ if (VT == MVT::v2f64 || VT == MVT::v4f32) {
+ LoadSDNode *LD = NULL;
+ if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
+ LD = dyn_cast<LoadSDNode>(SrcOp);
+ if (!LD) {
+ // movssrr and movsdrr do not clear top bits. Try to use movd, movq
+ // instead.
+ MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
+ if ((ExtVT.SimpleTy != MVT::i64 || Subtarget->is64Bit()) &&
+ SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ SrcOp.getOperand(0).getOpcode() == ISD::BIT_CONVERT &&
+ SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
+ // PR2108
+ OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
+ return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
+ DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+ OpVT,
+ SrcOp.getOperand(0)
+ .getOperand(0))));
+ }
+ }
+ }
+
+ return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
+ DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
+ DAG.getNode(ISD::BIT_CONVERT, dl,
+ OpVT, SrcOp)));
+}
+
+/// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of
+/// shuffles.
+static SDValue
+LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
+ SDValue V1 = SVOp->getOperand(0);
+ SDValue V2 = SVOp->getOperand(1);
+ DebugLoc dl = SVOp->getDebugLoc();
+ EVT VT = SVOp->getValueType(0);
+
+ SmallVector<std::pair<int, int>, 8> Locs;
+ Locs.resize(4);
+ SmallVector<int, 8> Mask1(4U, -1);
+ SmallVector<int, 8> PermMask;
+ SVOp->getMask(PermMask);
+
+ unsigned NumHi = 0;
+ unsigned NumLo = 0;
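+ // Classify each result element: record whether it comes from V1 (low) or V2
+ // (high), its slot in the gathering mask, and count how many come from each.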
+ for (unsigned i = 0; i != 4; ++i) {
+ int Idx = PermMask[i];
+ if (Idx < 0) {
+ Locs[i] = std::make_pair(-1, -1);
+ } else {
+ assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
+ if (Idx < 4) {
+ Locs[i] = std::make_pair(0, NumLo);
+ Mask1[NumLo] = Idx;
+ NumLo++;
+ } else {
+ Locs[i] = std::make_pair(1, NumHi);
+ if (2+NumHi < 4)
+ Mask1[2+NumHi] = Idx;
+ NumHi++;
+ }
+ }
+ }
+
+ if (NumLo <= 2 && NumHi <= 2) {
+ // No more than two elements come from either vector. This can be
+ // implemented with two shuffles. The first shuffle gathers the elements.
+ // The second shuffle, which takes the first shuffle as both of its
+ // vector operands, puts the elements into the right order.
+ V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
+
+ SmallVector<int, 8> Mask2(4U, -1);
+
+ for (unsigned i = 0; i != 4; ++i) {
+ if (Locs[i].first == -1)
+ continue;
+ else {
+ unsigned Idx = (i < 2) ? 0 : 4;
+ Idx += Locs[i].first * 2 + Locs[i].second;
+ Mask2[i] = Idx;
+ }
+ }
+
+ return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
+ } else if (NumLo == 3 || NumHi == 3) {
+ // Otherwise, we must have three elements from one vector, call it X, and
+ // one element from the other, call it Y. First, use a shufps to build an
+ // intermediate vector with the one element from Y and the element from X
+ // that will be in the same half in the final destination (the indexes don't
+ // matter). Then, use a shufps to build the final vector, taking the half
+ // containing the element from Y from the intermediate, and the other half
+ // from X.
+ if (NumHi == 3) {
+ // Normalize it so the 3 elements come from V1.
+ CommuteVectorShuffleMask(PermMask, VT);
+ std::swap(V1, V2);
+ }
+
+ // Find the element from V2.
+ unsigned HiIndex;
+ for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
+ int Val = PermMask[HiIndex];
+ if (Val < 0)
+ continue;
+ if (Val >= 4)
+ break;
+ }
+
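+ // Build the intermediate vector: place the single element from V2 and its
+ // partner element from V1 into one register so the final shufps can take
+ // the correct half from each source.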
+ Mask1[0] = PermMask[HiIndex];
+ Mask1[1] = -1;
+ Mask1[2] = PermMask[HiIndex^1];
+ Mask1[3] = -1;
+ V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
+
+ if (HiIndex >= 2) {
+ Mask1[0] = PermMask[0];
+ Mask1[1] = PermMask[1];
+ Mask1[2] = HiIndex & 1 ? 6 : 4;
+ Mask1[3] = HiIndex & 1 ? 4 : 6;
+ return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
+ } else {
+ Mask1[0] = HiIndex & 1 ? 2 : 0;
+ Mask1[1] = HiIndex & 1 ? 0 : 2;
+ Mask1[2] = PermMask[2];
+ Mask1[3] = PermMask[3];
+ if (Mask1[2] >= 0)
+ Mask1[2] += 4;
+ if (Mask1[3] >= 0)
+ Mask1[3] += 4;
+ return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
+ }
+ }
+
+ // Break it into (shuffle shuffle_hi, shuffle_lo).
+ Locs.clear();
+ Locs.resize(4);
+ SmallVector<int,8> LoMask(4U, -1);
+ SmallVector<int,8> HiMask(4U, -1);
+
+ SmallVector<int,8> *MaskPtr = &LoMask;
+ unsigned MaskIdx = 0;
+ unsigned LoIdx = 0;
+ unsigned HiIdx = 2;
+ for (unsigned i = 0; i != 4; ++i) {
+ if (i == 2) {
+ MaskPtr = &HiMask;
+ MaskIdx = 1;
+ LoIdx = 0;
+ HiIdx = 2;
+ }
+ int Idx = PermMask[i];
+ if (Idx < 0) {
+ Locs[i] = std::make_pair(-1, -1);
+ } else if (Idx < 4) {
+ Locs[i] = std::make_pair(MaskIdx, LoIdx);
+ (*MaskPtr)[LoIdx] = Idx;
+ LoIdx++;
+ } else {
+ Locs[i] = std::make_pair(MaskIdx, HiIdx);
+ (*MaskPtr)[HiIdx] = Idx;
+ HiIdx++;
+ }
+ }
+
+ SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
+ SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
+ SmallVector<int, 8> MaskOps;
+ for (unsigned i = 0; i != 4; ++i) {
+ if (Locs[i].first == -1) {
+ MaskOps.push_back(-1);
+ } else {
+ unsigned Idx = Locs[i].first * 4 + Locs[i].second;
+ MaskOps.push_back(Idx);
+ }
+ }
+ return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
+}
+
+SDValue
+X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ SDValue V1 = Op.getOperand(0);
+ SDValue V2 = Op.getOperand(1);
+ EVT VT = Op.getValueType();
+ DebugLoc dl = Op.getDebugLoc();
+ unsigned NumElems = VT.getVectorNumElements();
+ bool isMMX = VT.getSizeInBits() == 64;
+ bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
+ bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
+ bool V1IsSplat = false;
+ bool V2IsSplat = false;
+
+ if (isZeroShuffle(SVOp))
+ return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
+
+ // Promote splats to v4f32.
+ if (SVOp->isSplat()) {
+ if (isMMX || NumElems < 4)
+ return Op;
+ return PromoteSplat(SVOp, DAG, Subtarget->hasSSE2());
+ }
+
+ // If the shuffle can be profitably rewritten as a narrower shuffle, then
+ // do it!
+ if (VT == MVT::v8i16 || VT == MVT::v16i8) {
+ SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
+ if (NewOp.getNode())
+ return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
+ LowerVECTOR_SHUFFLE(NewOp, DAG));
+ } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
+ // FIXME: Figure out a cleaner way to do this.
+ // Try to make use of movq to zero out the top part.
+ if (ISD::isBuildVectorAllZeros(V2.getNode())) {
+ SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
+ if (NewOp.getNode()) {
+ if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false))
+ return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0),
+ DAG, Subtarget, dl);
+ }
+ } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
+ SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
+ if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)))
+ return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1),
+ DAG, Subtarget, dl);
+ }
+ }
+
+ if (X86::isPSHUFDMask(SVOp))
+ return Op;
+
+ // Check if this can be converted into a logical shift.
+ bool isLeft = false;
+ unsigned ShAmt = 0;
+ SDValue ShVal;
+ bool isShift = getSubtarget()->hasSSE2() &&
+ isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
+ if (isShift && ShVal.hasOneUse()) {
+ // If the shifted value has multiple uses, it may be cheaper to use
+ // v_set0 + movlhps or movhlps, etc.
+ EVT EltVT = VT.getVectorElementType();
+ ShAmt *= EltVT.getSizeInBits();
+ return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
+ }
+
+ if (X86::isMOVLMask(SVOp)) {
+ if (V1IsUndef)
+ return V2;
+ if (ISD::isBuildVectorAllZeros(V1.getNode()))
+ return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
+ if (!isMMX)
+ return Op;
+ }
+
+ // FIXME: fold these into legal mask.
+ if (!isMMX && (X86::isMOVSHDUPMask(SVOp) ||
+ X86::isMOVSLDUPMask(SVOp) ||
+ X86::isMOVHLPSMask(SVOp) ||
+ X86::isMOVLHPSMask(SVOp) ||
+ X86::isMOVLPMask(SVOp)))
+ return Op;
+
+ if (ShouldXformToMOVHLPS(SVOp) ||
+ ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp))
+ return CommuteVectorShuffle(SVOp, DAG);
+
+ if (isShift) {
+ // No better options. Use a vshl / vsrl.
+ EVT EltVT = VT.getVectorElementType();
+ ShAmt *= EltVT.getSizeInBits();
+ return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
+ }
+
+ bool Commuted = false;
+ // FIXME: This should also accept a bitcast of a splat? Be careful, not
+ // 1,1,1,1 -> v8i16 though.
+ V1IsSplat = isSplatVector(V1.getNode());
+ V2IsSplat = isSplatVector(V2.getNode());
+
+ // Canonicalize the splat or undef, if present, to be on the RHS.
+ if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) {
+ Op = CommuteVectorShuffle(SVOp, DAG);
+ SVOp = cast<ShuffleVectorSDNode>(Op);
+ V1 = SVOp->getOperand(0);
+ V2 = SVOp->getOperand(1);
+ std::swap(V1IsSplat, V2IsSplat);
+ std::swap(V1IsUndef, V2IsUndef);
+ Commuted = true;
+ }
+
+ if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) {
+ // Shuffling low element of v1 into undef, just return v1.
+ if (V2IsUndef)
+ return V1;
+ // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
+ // the instruction selector will not match, so get a canonical MOVL with
+ // swapped operands to undo the commute.
+ return getMOVL(DAG, dl, VT, V2, V1);
+ }
+
+ if (X86::isUNPCKL_v_undef_Mask(SVOp) ||
+ X86::isUNPCKH_v_undef_Mask(SVOp) ||
+ X86::isUNPCKLMask(SVOp) ||
+ X86::isUNPCKHMask(SVOp))
+ return Op;
+
+ if (V2IsSplat) {
+ // Normalize the mask so all entries that point to V2 point to its first
+ // element, then try to match unpck{h|l} again. If it matches, return a
+ // new vector_shuffle with the corrected mask.
+ SDValue NewMask = NormalizeMask(SVOp, DAG);
+ ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask);
+ if (NSVOp != SVOp) {
+ if (X86::isUNPCKLMask(NSVOp, true)) {
+ return NewMask;
+ } else if (X86::isUNPCKHMask(NSVOp, true)) {
+ return NewMask;
+ }
+ }
+ }
+
+ if (Commuted) {
+ // Commute it back and try unpck* again.
+ // FIXME: this seems wrong.
+ SDValue NewOp = CommuteVectorShuffle(SVOp, DAG);
+ ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp);
+ if (X86::isUNPCKL_v_undef_Mask(NewSVOp) ||
+ X86::isUNPCKH_v_undef_Mask(NewSVOp) ||
+ X86::isUNPCKLMask(NewSVOp) ||
+ X86::isUNPCKHMask(NewSVOp))
+ return NewOp;
+ }
+
+ // FIXME: for mmx, bitcast v2i32 to v4i16 for shuffle.
+
+ // Normalize the node to match x86 shuffle ops if needed
+ if (!isMMX && V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp))
+ return CommuteVectorShuffle(SVOp, DAG);
+
+ // Check if the shuffle mask is already legal; if so, return the op as-is.
+ SmallVector<int, 16> PermMask;
+ SVOp->getMask(PermMask);
+ if (isShuffleMaskLegal(PermMask, VT))
+ return Op;
+
+ // Handle v8i16 specifically since SSE can do byte extraction and insertion.
+ if (VT == MVT::v8i16) {
+ SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(SVOp, DAG, *this);
+ if (NewOp.getNode())
+ return NewOp;
+ }
+
+ if (VT == MVT::v16i8) {
+ SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this);
+ if (NewOp.getNode())
+ return NewOp;
+ }
+
+ // Handle all 4 wide cases with a number of shuffles except for MMX.
+ if (NumElems == 4 && !isMMX)
+ return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG);
+
+ return SDValue();
+}
+
+SDValue
+X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ DebugLoc dl = Op.getDebugLoc();
+ if (VT.getSizeInBits() == 8) {
+ SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
+ Op.getOperand(0), Op.getOperand(1));
+ SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
+ DAG.getValueType(VT));
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
+ } else if (VT.getSizeInBits() == 16) {
+ unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ // If Idx is 0, it's cheaper to do a move instead of a pextrw.
+ if (Idx == 0)
+ return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+ DAG.getNode(ISD::BIT_CONVERT, dl,
+ MVT::v4i32,
+ Op.getOperand(0)),
+ Op.getOperand(1)));
+ SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
+ Op.getOperand(0), Op.getOperand(1));
+ SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
+ DAG.getValueType(VT));
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
+ } else if (VT == MVT::f32) {
+ // EXTRACTPS outputs to a GPR32 register, which will require a movd to copy
+ // the result back to an FR32 register. It's only worth matching if the
+ // result has a single use which is a store or a bitcast to i32. And in
+ // the case of a store, it's not worth it if the index is a constant 0,
+ // because a MOVSSmr can be used instead, which is smaller and faster.
+ if (!Op.hasOneUse())
+ return SDValue();
+ SDNode *User = *Op.getNode()->use_begin();
+ if ((User->getOpcode() != ISD::STORE ||
+ (isa<ConstantSDNode>(Op.getOperand(1)) &&
+ cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
+ (User->getOpcode() != ISD::BIT_CONVERT ||
+ User->getValueType(0) != MVT::i32))
+ return SDValue();
+ SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+ DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32,
+ Op.getOperand(0)),
+ Op.getOperand(1));
+ return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Extract);
+ } else if (VT == MVT::i32) {
+ // ExtractPS works with constant index.
+ if (isa<ConstantSDNode>(Op.getOperand(1)))
+ return Op;
+ }
+ return SDValue();
+}
+
+
+SDValue
+X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
+ SelectionDAG &DAG) const {
+ if (!isa<ConstantSDNode>(Op.getOperand(1)))
+ return SDValue();
+
+ if (Subtarget->hasSSE41()) {
+ SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
+ if (Res.getNode())
+ return Res;
+ }
+
+ EVT VT = Op.getValueType();
+ DebugLoc dl = Op.getDebugLoc();
+ // TODO: handle v16i8.
+ if (VT.getSizeInBits() == 16) {
+ SDValue Vec = Op.getOperand(0);
+ unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ if (Idx == 0)
+ return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+ DAG.getNode(ISD::BIT_CONVERT, dl,
+ MVT::v4i32, Vec),
+ Op.getOperand(1)));
+ // Transform it so it matches pextrw, which produces a 32-bit result.
+ EVT EltVT = MVT::i32;
+ SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
+ Op.getOperand(0), Op.getOperand(1));
+ SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
+ DAG.getValueType(VT));
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
+ } else if (VT.getSizeInBits() == 32) {
+ unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ if (Idx == 0)
+ return Op;
+
+ // SHUFPS the element to the lowest double word, then movss.
+ int Mask[4] = { Idx, -1, -1, -1 };
+ EVT VVT = Op.getOperand(0).getValueType();
+ SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
+ DAG.getUNDEF(VVT), Mask);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
+ DAG.getIntPtrConstant(0));
+ } else if (VT.getSizeInBits() == 64) {
+ // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
+ // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
+ // to match extract_elt for f64.
+ unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ if (Idx == 0)
+ return Op;
+
+ // UNPCKHPD the element to the lowest double word, then movsd.
+ // Note: if the lower 64 bits of the result of the UNPCKHPD are then stored
+ // to a f64mem, the whole operation is folded into a single MOVHPDmr.
+ int Mask[2] = { 1, -1 };
+ EVT VVT = Op.getOperand(0).getValueType();
+ SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
+ DAG.getUNDEF(VVT), Mask);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
+ DAG.getIntPtrConstant(0));
+ }
+
+ return SDValue();
+}
+
+SDValue
+X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ EVT EltVT = VT.getVectorElementType();
+ DebugLoc dl = Op.getDebugLoc();
+
+ SDValue N0 = Op.getOperand(0);
+ SDValue N1 = Op.getOperand(1);
+ SDValue N2 = Op.getOperand(2);
+
+ if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
+ isa<ConstantSDNode>(N2)) {
+ unsigned Opc;
+ if (VT == MVT::v8i16)
+ Opc = X86ISD::PINSRW;
+ else if (VT == MVT::v4i16)
+ Opc = X86ISD::MMX_PINSRW;
+ else if (VT == MVT::v16i8)
+ Opc = X86ISD::PINSRB;
+ else
+ Opc = X86ISD::PINSRB;
+
+ // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
+ // argument.
+ if (N1.getValueType() != MVT::i32)
+ N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
+ if (N2.getValueType() != MVT::i32)
+ N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
+ return DAG.getNode(Opc, dl, VT, N0, N1, N2);
+ } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
+ // Bits [7:6] of the constant are the source select. This will always be
+ // zero here. The DAG Combiner may combine an extract_elt index into these
+ // bits. For example (insert (extract, 3), 2) could be matched by putting
+ // the '3' into bits [7:6] of X86ISD::INSERTPS.
+ // Bits [5:4] of the constant are the destination select. This is the
+ // value of the incoming immediate.
+ // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
+ // combine either bitwise AND or insert of float 0.0 to set these bits.
+ N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
+ // Create this as a scalar to vector.
+ N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
+ return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
+ } else if (EltVT == MVT::i32 && isa<ConstantSDNode>(N2)) {
+ // PINSR* works with constant index.
+ return Op;
+ }
+ return SDValue();
+}
+
+SDValue
+X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ EVT EltVT = VT.getVectorElementType();
+
+ if (Subtarget->hasSSE41())
+ return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);
+
+ if (EltVT == MVT::i8)
+ return SDValue();
+
+ DebugLoc dl = Op.getDebugLoc();
+ SDValue N0 = Op.getOperand(0);
+ SDValue N1 = Op.getOperand(1);
+ SDValue N2 = Op.getOperand(2);
+
+ if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) {
+ // Transform it so it matches pinsrw, which expects a 16-bit value in a GR32
+ // as its second argument.
+ if (N1.getValueType() != MVT::i32)
+ N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
+ if (N2.getValueType() != MVT::i32)
+ N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
+ return DAG.getNode(VT == MVT::v8i16 ? X86ISD::PINSRW : X86ISD::MMX_PINSRW,
+ dl, VT, N0, N1, N2);
+ }
+ return SDValue();
+}
+
+SDValue
+X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const {
+ DebugLoc dl = Op.getDebugLoc();
+ if (Op.getValueType() == MVT::v2f32)
+ return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f32,
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i32,
+ DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32,
+ Op.getOperand(0))));
+
+ if (Op.getValueType() == MVT::v1i64 && Op.getOperand(0).getValueType() == MVT::i64)
+ return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
+
+ SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
+ EVT VT = MVT::v2i32;
+ switch (Op.getValueType().getSimpleVT().SimpleTy) {
+ default: break;
+ case MVT::v16i8:
+ case MVT::v8i16:
+ VT = MVT::v4i32;
+ break;
+ }
+ return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(),
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, AnyExt));
+}
+
+// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
+// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
+// one of the above-mentioned nodes. It has to be wrapped because otherwise
+// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
+// be used to form an addressing mode. These wrapped nodes will be selected
+// into MOV32ri.
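+// For example, in 32-bit GOT-relative PIC a constant-pool address is built
+// below as roughly
+//   (add (X86ISD::GlobalBaseReg), (X86ISD::Wrapper TargetConstantPool@GOTOFF))
+// i.e. the wrapped target node plus the PIC base register.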
+SDValue
+X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
+ ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+
+ // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
+ // global base reg.
+ unsigned char OpFlag = 0;
+ unsigned WrapperKind = X86ISD::Wrapper;
+ CodeModel::Model M = getTargetMachine().getCodeModel();
+
+ if (Subtarget->isPICStyleRIPRel() &&
+ (M == CodeModel::Small || M == CodeModel::Kernel))
+ WrapperKind = X86ISD::WrapperRIP;
+ else if (Subtarget->isPICStyleGOT())
+ OpFlag = X86II::MO_GOTOFF;
+ else if (Subtarget->isPICStyleStubPIC())
+ OpFlag = X86II::MO_PIC_BASE_OFFSET;
+
+ SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
+ CP->getAlignment(),
+ CP->getOffset(), OpFlag);
+ DebugLoc DL = CP->getDebugLoc();
+ Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
+ // With PIC, the address is actually $g + Offset.
+ if (OpFlag) {
+ Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
+ DAG.getNode(X86ISD::GlobalBaseReg,
+ DebugLoc(), getPointerTy()),
+ Result);
+ }
+
+ return Result;
+}
+
+SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
+ JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+
+ // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
+ // global base reg.
+ unsigned char OpFlag = 0;
+ unsigned WrapperKind = X86ISD::Wrapper;
+ CodeModel::Model M = getTargetMachine().getCodeModel();
+
+ if (Subtarget->isPICStyleRIPRel() &&
+ (M == CodeModel::Small || M == CodeModel::Kernel))
+ WrapperKind = X86ISD::WrapperRIP;
+ else if (Subtarget->isPICStyleGOT())
+ OpFlag = X86II::MO_GOTOFF;
+ else if (Subtarget->isPICStyleStubPIC())
+ OpFlag = X86II::MO_PIC_BASE_OFFSET;
+
+ SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
+ OpFlag);
+ DebugLoc DL = JT->getDebugLoc();
+ Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
+
+ // With PIC, the address is actually $g + Offset.
+ if (OpFlag) {
+ Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
+ DAG.getNode(X86ISD::GlobalBaseReg,
+ DebugLoc(), getPointerTy()),
+ Result);
+ }
+
+ return Result;
+}
+
+SDValue
+X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
+ const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
+
+ // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
+ // global base reg.
+ unsigned char OpFlag = 0;
+ unsigned WrapperKind = X86ISD::Wrapper;
+ CodeModel::Model M = getTargetMachine().getCodeModel();
+
+ if (Subtarget->isPICStyleRIPRel() &&
+ (M == CodeModel::Small || M == CodeModel::Kernel))
+ WrapperKind = X86ISD::WrapperRIP;
+ else if (Subtarget->isPICStyleGOT())
+ OpFlag = X86II::MO_GOTOFF;
+ else if (Subtarget->isPICStyleStubPIC())
+ OpFlag = X86II::MO_PIC_BASE_OFFSET;
+
+ SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
+
+ DebugLoc DL = Op.getDebugLoc();
+ Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
+
+
+ // With PIC, the address is actually $g + Offset.
+ if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+ !Subtarget->is64Bit()) {
+ Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
+ DAG.getNode(X86ISD::GlobalBaseReg,
+ DebugLoc(), getPointerTy()),
+ Result);
+ }
+
+ return Result;
+}
+
+SDValue
+X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
+ // Create the TargetBlockAddress node.
+ unsigned char OpFlags =
+ Subtarget->ClassifyBlockAddressReference();
+ CodeModel::Model M = getTargetMachine().getCodeModel();
+ const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+ DebugLoc dl = Op.getDebugLoc();
+ SDValue Result = DAG.getBlockAddress(BA, getPointerTy(),
+ /*isTarget=*/true, OpFlags);
+
+ if (Subtarget->isPICStyleRIPRel() &&
+ (M == CodeModel::Small || M == CodeModel::Kernel))
+ Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
+ else
+ Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
+
+ // With PIC, the address is actually $g + Offset.
+ if (isGlobalRelativeToPICBase(OpFlags)) {
+ Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
+ DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
+ Result);
+ }
+
+ return Result;
+}
+
+SDValue
+X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
+ int64_t Offset,
+ SelectionDAG &DAG) const {
+ // Create the TargetGlobalAddress node, folding in the constant
+ // offset if it is legal.
+ unsigned char OpFlags =
+ Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
+ CodeModel::Model M = getTargetMachine().getCodeModel();
+ SDValue Result;
+ if (OpFlags == X86II::MO_NO_FLAG &&
+ X86::isOffsetSuitableForCodeModel(Offset, M)) {
+ // A direct static reference to a global.
+ Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), Offset);
+ Offset = 0;
+ } else {
+ Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), 0, OpFlags);
+ }
+
+ if (Subtarget->isPICStyleRIPRel() &&
+ (M == CodeModel::Small || M == CodeModel::Kernel))
+ Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
+ else
+ Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
+
+ // With PIC, the address is actually $g + Offset.
+ if (isGlobalRelativeToPICBase(OpFlags)) {
+ Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
+ DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
+ Result);
+ }
+
+ // For globals that require a load from a stub to get the address, emit the
+ // load.
+ if (isGlobalStubReference(OpFlags))
+ Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
+ PseudoSourceValue::getGOT(), 0, false, false, 0);
+
+ // If there was a non-zero offset that we didn't fold, create an explicit
+ // addition for it.
+ if (Offset != 0)
+ Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
+ DAG.getConstant(Offset, getPointerTy()));
+
+ return Result;
+}
+
+SDValue
+X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
+ const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+ int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
+ return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG);
+}
+
+static SDValue
+GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
+ SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
+ unsigned char OperandFlags) {
+ MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+ DebugLoc dl = GA->getDebugLoc();
+ SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(),
+ GA->getValueType(0),
+ GA->getOffset(),
+ OperandFlags);
+ if (InFlag) {
+ SDValue Ops[] = { Chain, TGA, *InFlag };
+ Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3);
+ } else {
+ SDValue Ops[] = { Chain, TGA };
+ Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2);
+ }
+
+ // TLSADDR will be codegen'ed as a call. Inform MFI that the function has calls.
+ MFI->setAdjustsStack(true);
+
+ SDValue Flag = Chain.getValue(1);
+ return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
+}
+
+// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
+static SDValue
+LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+ const EVT PtrVT) {
+ SDValue InFlag;
+ DebugLoc dl = GA->getDebugLoc(); // ? function entry point might be better
+ SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
+ DAG.getNode(X86ISD::GlobalBaseReg,
+ DebugLoc(), PtrVT), InFlag);
+ InFlag = Chain.getValue(1);
+
+ return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
+}
+
+// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
+static SDValue
+LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+ const EVT PtrVT) {
+ return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT,
+ X86::RAX, X86II::MO_TLSGD);
+}
+
+// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or
+// "local exec" model.
+static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+ const EVT PtrVT, TLSModel::Model model,
+ bool is64Bit) {
+ DebugLoc dl = GA->getDebugLoc();
+ // Get the Thread Pointer
+ SDValue Base = DAG.getNode(X86ISD::SegmentBaseAddress,
+ DebugLoc(), PtrVT,
+ DAG.getRegister(is64Bit? X86::FS : X86::GS,
+ MVT::i32));
+
+ SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Base,
+ NULL, 0, false, false, 0);
+
+ unsigned char OperandFlags = 0;
+ // Most TLS accesses are not RIP relative, even on x86-64. One exception is
+ // initial exec.
+ unsigned WrapperKind = X86ISD::Wrapper;
+ if (model == TLSModel::LocalExec) {
+ OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
+ } else if (is64Bit) {
+ assert(model == TLSModel::InitialExec);
+ OperandFlags = X86II::MO_GOTTPOFF;
+ WrapperKind = X86ISD::WrapperRIP;
+ } else {
+ assert(model == TLSModel::InitialExec);
+ OperandFlags = X86II::MO_INDNTPOFF;
+ }
+
+ // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial
+ // exec)
+ SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), GA->getValueType(0),
+ GA->getOffset(), OperandFlags);
+ SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
+
+ if (model == TLSModel::InitialExec)
+ Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
+ PseudoSourceValue::getGOT(), 0, false, false, 0);
+
+ // The address of the thread local variable is the add of the thread
+ // pointer with the offset of the variable.
+ return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
+}
+
+SDValue
+X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
+ // TODO: implement the "local dynamic" model
+ // TODO: implement the "initial exec"model for pic executables
+ assert(Subtarget->isTargetELF() &&
+ "TLS not implemented for non-ELF targets");
+ GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+ const GlobalValue *GV = GA->getGlobal();
+
+ // If GV is an alias then use the aliasee for determining
+ // thread-localness.
+ if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
+ GV = GA->resolveAliasedGlobal(false);
+
+ TLSModel::Model model = getTLSModel(GV,
+ getTargetMachine().getRelocationModel());
+
+ switch (model) {
+ case TLSModel::GeneralDynamic:
+ case TLSModel::LocalDynamic: // not implemented
+ if (Subtarget->is64Bit())
+ return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
+ return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
+
+ case TLSModel::InitialExec:
+ case TLSModel::LocalExec:
+ return LowerToTLSExecModel(GA, DAG, getPointerTy(), model,
+ Subtarget->is64Bit());
+ }
+
+ llvm_unreachable("Unreachable");
+ return SDValue();
+}
+
+
+/// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and
+/// take a 2 x i32 value to shift plus a shift amount.
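+/// For SHL_PARTS, for example, the code below computes
+///   hi = shld(hi, lo, amt), lo = shl(lo, amt)
+/// and then uses a CMP on (amt & VTBits) plus two CMOVs to instead select
+///   hi = shl(lo, amt), lo = 0
+/// when the shift amount reaches or exceeds the word size.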
+SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
+ assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+ EVT VT = Op.getValueType();
+ unsigned VTBits = VT.getSizeInBits();
+ DebugLoc dl = Op.getDebugLoc();
+ bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
+ SDValue ShOpLo = Op.getOperand(0);
+ SDValue ShOpHi = Op.getOperand(1);
+ SDValue ShAmt = Op.getOperand(2);
+ SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
+ DAG.getConstant(VTBits - 1, MVT::i8))
+ : DAG.getConstant(0, VT);
+
+ SDValue Tmp2, Tmp3;
+ if (Op.getOpcode() == ISD::SHL_PARTS) {
+ Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
+ Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
+ } else {
+ Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
+ Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt);
+ }
+
+ SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
+ DAG.getConstant(VTBits, MVT::i8));
+ SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
+ AndNode, DAG.getConstant(0, MVT::i8));
+
+ SDValue Hi, Lo;
+ SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
+ SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
+ SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
+
+ if (Op.getOpcode() == ISD::SHL_PARTS) {
+ Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
+ Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
+ } else {
+ Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
+ Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
+ }
+
+ SDValue Ops[2] = { Lo, Hi };
+ return DAG.getMergeValues(Ops, 2, dl);
+}
+
+SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT SrcVT = Op.getOperand(0).getValueType();
+
+ if (SrcVT.isVector()) {
+ if (SrcVT == MVT::v2i32 && Op.getValueType() == MVT::v2f64) {
+ return Op;
+ }
+ return SDValue();
+ }
+
+ assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 &&
+ "Unknown SINT_TO_FP to lower!");
+
+ // These are really Legal; return the operand so the caller accepts it as
+ // Legal.
+ if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
+ return Op;
+ if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
+ Subtarget->is64Bit()) {
+ return Op;
+ }
+
+ DebugLoc dl = Op.getDebugLoc();
+ unsigned Size = SrcVT.getSizeInBits()/8;
+ MachineFunction &MF = DAG.getMachineFunction();
+ int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
+ SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+ SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
+ StackSlot,
+ PseudoSourceValue::getFixedStack(SSFI), 0,
+ false, false, 0);
+ return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
+}
+
+SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
+ SDValue StackSlot,
+ SelectionDAG &DAG) const {
+ // Build the FILD
+ DebugLoc dl = Op.getDebugLoc();
+ SDVTList Tys;
+ bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
+ if (useSSE)
+ Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Flag);
+ else
+ Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
+ SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
+ SDValue Result = DAG.getNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, dl,
+ Tys, Ops, array_lengthof(Ops));
+
+ if (useSSE) {
+ Chain = Result.getValue(1);
+ SDValue InFlag = Result.getValue(2);
+
+ // FIXME: Currently the FST is flagged to the FILD_FLAG. This
+ // shouldn't be necessary except that RFP cannot be live across
+ // multiple blocks. When stackifier is fixed, they can be uncoupled.
+ MachineFunction &MF = DAG.getMachineFunction();
+ int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8, false);
+ SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+ Tys = DAG.getVTList(MVT::Other);
+ SDValue Ops[] = {
+ Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
+ };
+ Chain = DAG.getNode(X86ISD::FST, dl, Tys, Ops, array_lengthof(Ops));
+ Result = DAG.getLoad(Op.getValueType(), dl, Chain, StackSlot,
+ PseudoSourceValue::getFixedStack(SSFI), 0,
+ false, false, 0);
+ }
+
+ return Result;
+}
+
+// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
+SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
+ SelectionDAG &DAG) const {
+ // This algorithm is not obvious. Here it is in C code, more or less:
+ /*
+ double uint64_to_double( uint32_t hi, uint32_t lo ) {
+ static const __m128i exp = { 0x4330000045300000ULL, 0 };
+ static const __m128d bias = { 0x1.0p84, 0x1.0p52 };
+
+ // Copy ints to xmm registers.
+ __m128i xh = _mm_cvtsi32_si128( hi );
+ __m128i xl = _mm_cvtsi32_si128( lo );
+
+ // Combine into low half of a single xmm register.
+ __m128i x = _mm_unpacklo_epi32( xh, xl );
+ __m128d d;
+ double sd;
+
+ // Merge in appropriate exponents to give the integer bits the right
+ // magnitude.
+ x = _mm_unpacklo_epi32( x, exp );
+
+ // Subtract away the biases to deal with the IEEE-754 double precision
+ // implicit 1.
+ d = _mm_sub_pd( (__m128d) x, bias );
+
+ // All conversions up to here are exact. The correctly rounded result is
+ // calculated using the current rounding mode using the following
+ // horizontal add.
+ d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) );
+ _mm_store_sd( &sd, d ); // Because we are returning doubles in XMM, this
+ // store doesn't really need to be here (except
+ // maybe to zero the other double)
+ return sd;
+ }
+ */
+
+ DebugLoc dl = Op.getDebugLoc();
+ LLVMContext *Context = DAG.getContext();
+
+ // Build some magic constants.
+ std::vector<Constant*> CV0;
+ CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000)));
+ CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000)));
+ CV0.push_back(ConstantInt::get(*Context, APInt(32, 0)));
+ CV0.push_back(ConstantInt::get(*Context, APInt(32, 0)));
+ Constant *C0 = ConstantVector::get(CV0);
+ SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
+
+ std::vector<Constant*> CV1;
+ CV1.push_back(
+ ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL))));
+ CV1.push_back(
+ ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL))));
+ Constant *C1 = ConstantVector::get(CV1);
+ SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
+
+ SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
+ DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+ Op.getOperand(0),
+ DAG.getIntPtrConstant(1)));
+ SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
+ DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+ Op.getOperand(0),
+ DAG.getIntPtrConstant(0)));
+ SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2);
+ SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
+ PseudoSourceValue::getConstantPool(), 0,
+ false, false, 16);
+ SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0);
+ SDValue XR2F = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Unpck2);
+ SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
+ PseudoSourceValue::getConstantPool(), 0,
+ false, false, 16);
+ SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
+
+ // Add the halves; easiest way is to swap them into another reg first.
+ int ShufMask[2] = { 1, -1 };
+ SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub,
+ DAG.getUNDEF(MVT::v2f64), ShufMask);
+ SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add,
+ DAG.getIntPtrConstant(0));
+}
+
+// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
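+// A rough C sketch of the trick used below (the usual 2^52 bias trick):
+//   double u32_to_double(uint32_t x) {
+//     uint64_t bits = 0x4330000000000000ULL | x; // 2^52 with x in the mantissa
+//     double d;
+//     memcpy(&d, &bits, sizeof d);               // the OR below does this in an XMM reg
+//     return d - 0x1.0p52;                       // exact subtraction leaves (double)x
+//   }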
+SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
+ SelectionDAG &DAG) const {
+ DebugLoc dl = Op.getDebugLoc();
+ // FP constant to bias correct the final result.
+ SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
+ MVT::f64);
+
+ // Load the 32-bit value into an XMM register.
+ SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
+ DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+ Op.getOperand(0),
+ DAG.getIntPtrConstant(0)));
+
+ Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
+ DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Load),
+ DAG.getIntPtrConstant(0));
+
+ // Or the load with the bias.
+ SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
+ DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+ MVT::v2f64, Load)),
+ DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+ MVT::v2f64, Bias)));
+ Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
+ DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Or),
+ DAG.getIntPtrConstant(0));
+
+ // Subtract the bias.
+ SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
+
+ // Handle final rounding.
+ EVT DestVT = Op.getValueType();
+
+ if (DestVT.bitsLT(MVT::f64)) {
+ return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
+ DAG.getIntPtrConstant(0));
+ } else if (DestVT.bitsGT(MVT::f64)) {
+ return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
+ }
+
+ // The value already has the destination type; no further rounding is needed.
+ return Sub;
+}
+
+SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue N0 = Op.getOperand(0);
+ DebugLoc dl = Op.getDebugLoc();
+
+ // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
+ // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
+ // the optimization here.
+ if (DAG.SignBitIsZero(N0))
+ return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
+
+ EVT SrcVT = N0.getValueType();
+ EVT DstVT = Op.getValueType();
+ if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
+ return LowerUINT_TO_FP_i64(Op, DAG);
+ else if (SrcVT == MVT::i32 && X86ScalarSSEf64)
+ return LowerUINT_TO_FP_i32(Op, DAG);
+
+ // Make a 64-bit buffer, and use it to build an FILD.
+ SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
+ if (SrcVT == MVT::i32) {
+ SDValue WordOff = DAG.getConstant(4, getPointerTy());
+ SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
+ getPointerTy(), StackSlot, WordOff);
+ SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
+ StackSlot, NULL, 0, false, false, 0);
+ SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
+ OffsetSlot, NULL, 0, false, false, 0);
+ SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
+ return Fild;
+ }
+
+ assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
+ SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
+ StackSlot, NULL, 0, false, false, 0);
+ // For i64 source, we need to add the appropriate power of 2 if the input
+ // was negative. This is the same as the optimization in
+ // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
+ // we must be careful to do the computation in x87 extended precision, not
+ // in SSE. (The generic code can't know it's OK to do this, or how to.)
+ SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
+ SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
+ SDValue Fild = DAG.getNode(X86ISD::FILD, dl, Tys, Ops, 3);
+
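+ // 0x5F800000 is 2^64 as an IEEE-754 single. FILD treats the stored i64 as
+ // signed, so when the sign bit was set the loaded value is short by exactly
+ // 2^64; the fudge factor selected below adds it back.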
+ APInt FF(32, 0x5F800000ULL);
+
+ // Check whether the sign bit is set.
+ SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(MVT::i64),
+ Op.getOperand(0), DAG.getConstant(0, MVT::i64),
+ ISD::SETLT);
+
+ // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
+ SDValue FudgePtr = DAG.getConstantPool(
+ ConstantInt::get(*DAG.getContext(), FF.zext(64)),
+ getPointerTy());
+
+ // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
+ SDValue Zero = DAG.getIntPtrConstant(0);
+ SDValue Four = DAG.getIntPtrConstant(4);
+ SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
+ Zero, Four);
+ FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset);
+
+ // Load the value out, extending it from f32 to f80.
+ // FIXME: Avoid the extend by constructing the right constant pool?
+ SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(),
+ FudgePtr, PseudoSourceValue::getConstantPool(),
+ 0, MVT::f32, false, false, 4);
+ // Extend everything to 80 bits to force it to be done on x87.
+ SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
+ return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0));
+}
+
+std::pair<SDValue,SDValue> X86TargetLowering::
+FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) const {
+ DebugLoc dl = Op.getDebugLoc();
+
+ EVT DstTy = Op.getValueType();
+
+ if (!IsSigned) {
+ assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
+ DstTy = MVT::i64;
+ }
+
+ assert(DstTy.getSimpleVT() <= MVT::i64 &&
+ DstTy.getSimpleVT() >= MVT::i16 &&
+ "Unknown FP_TO_SINT to lower!");
+
+ // These are really Legal.
+ if (DstTy == MVT::i32 &&
+ isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
+ return std::make_pair(SDValue(), SDValue());
+ if (Subtarget->is64Bit() &&
+ DstTy == MVT::i64 &&
+ isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
+ return std::make_pair(SDValue(), SDValue());
+
+ // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary
+ // stack slot.
+ MachineFunction &MF = DAG.getMachineFunction();
+ unsigned MemSize = DstTy.getSizeInBits()/8;
+ int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
+ SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+
+ unsigned Opc;
+ switch (DstTy.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
+ case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
+ case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
+ case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
+ }
+
+ SDValue Chain = DAG.getEntryNode();
+ SDValue Value = Op.getOperand(0);
+ if (isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) {
+ assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
+ Chain = DAG.getStore(Chain, dl, Value, StackSlot,
+ PseudoSourceValue::getFixedStack(SSFI), 0,
+ false, false, 0);
+ SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
+ SDValue Ops[] = {
+ Chain, StackSlot, DAG.getValueType(Op.getOperand(0).getValueType())
+ };
+ Value = DAG.getNode(X86ISD::FLD, dl, Tys, Ops, 3);
+ Chain = Value.getValue(1);
+ SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
+ StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+ }
+
+ // Build the FP_TO_INT*_IN_MEM
+ SDValue Ops[] = { Chain, Value, StackSlot };
+ SDValue FIST = DAG.getNode(Opc, dl, MVT::Other, Ops, 3);
+
+ return std::make_pair(FIST, StackSlot);
+}
+
+SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
+ SelectionDAG &DAG) const {
+ if (Op.getValueType().isVector()) {
+ if (Op.getValueType() == MVT::v2i32 &&
+ Op.getOperand(0).getValueType() == MVT::v2f64) {
+ return Op;
+ }
+ return SDValue();
+ }
+
+ std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true);
+ SDValue FIST = Vals.first, StackSlot = Vals.second;
+ // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
+ if (FIST.getNode() == 0) return Op;
+
+ // Load the result.
+ return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
+ FIST, StackSlot, NULL, 0, false, false, 0);
+}
+
+SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
+ SelectionDAG &DAG) const {
+ std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false);
+ SDValue FIST = Vals.first, StackSlot = Vals.second;
+ assert(FIST.getNode() && "Unexpected failure");
+
+ // Load the result.
+ return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
+ FIST, StackSlot, NULL, 0, false, false, 0);
+}
+
+SDValue X86TargetLowering::LowerFABS(SDValue Op,
+ SelectionDAG &DAG) const {
+ LLVMContext *Context = DAG.getContext();
+ DebugLoc dl = Op.getDebugLoc();
+ EVT VT = Op.getValueType();
+ EVT EltVT = VT;
+ if (VT.isVector())
+ EltVT = VT.getVectorElementType();
+ std::vector<Constant*> CV;
+ if (EltVT == MVT::f64) {
+ Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))));
+ CV.push_back(C);
+ CV.push_back(C);
+ } else {
+ Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))));
+ CV.push_back(C);
+ CV.push_back(C);
+ CV.push_back(C);
+ CV.push_back(C);
+ }
+ Constant *C = ConstantVector::get(CV);
+ SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
+ SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
+ PseudoSourceValue::getConstantPool(), 0,
+ false, false, 16);
+ return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask);
+}
+
+SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const {
+ LLVMContext *Context = DAG.getContext();
+ DebugLoc dl = Op.getDebugLoc();
+ EVT VT = Op.getValueType();
+ EVT EltVT = VT;
+ if (VT.isVector())
+ EltVT = VT.getVectorElementType();
+ std::vector<Constant*> CV;
+ if (EltVT == MVT::f64) {
+ Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)));
+ CV.push_back(C);
+ CV.push_back(C);
+ } else {
+ Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)));
+ CV.push_back(C);
+ CV.push_back(C);
+ CV.push_back(C);
+ CV.push_back(C);
+ }
+ Constant *C = ConstantVector::get(CV);
+ SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
+ SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
+ PseudoSourceValue::getConstantPool(), 0,
+ false, false, 16);
+ if (VT.isVector()) {
+ return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
+ DAG.getNode(ISD::XOR, dl, MVT::v2i64,
+ DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
+ Op.getOperand(0)),
+ DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, Mask)));
+ } else {
+ return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
+ }
+}
+
+SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
+ LLVMContext *Context = DAG.getContext();
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ DebugLoc dl = Op.getDebugLoc();
+ EVT VT = Op.getValueType();
+ EVT SrcVT = Op1.getValueType();
+
+ // If second operand is smaller, extend it first.
+ if (SrcVT.bitsLT(VT)) {
+ Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
+ SrcVT = VT;
+ }
+ // And if it is bigger, shrink it first.
+ if (SrcVT.bitsGT(VT)) {
+ Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
+ SrcVT = VT;
+ }
+
+ // At this point the operands and the result should have the same
+ // type, and that won't be f80 since that is not custom lowered.
+
+ // First get the sign bit of second operand.
+ std::vector<Constant*> CV;
+ if (SrcVT == MVT::f64) {
+ CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))));
+ CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0))));
+ } else {
+ CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))));
+ CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
+ CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
+ CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
+ }
+ Constant *C = ConstantVector::get(CV);
+ SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
+ SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
+ PseudoSourceValue::getConstantPool(), 0,
+ false, false, 16);
+ SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
+
+ // Shift sign bit right or left if the two operands have different types.
+ if (SrcVT.bitsGT(VT)) {
+ // Op0 is MVT::f32, Op1 is MVT::f64.
+ SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit);
+ SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit,
+ DAG.getConstant(32, MVT::i32));
+ SignBit = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, SignBit);
+ SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit,
+ DAG.getIntPtrConstant(0));
+ }
+
+ // Clear first operand sign bit.
+ CV.clear();
+ if (VT == MVT::f64) {
+ CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))));
+ CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0))));
+ } else {
+ CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))));
+ CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
+ CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
+ CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
+ }
+ C = ConstantVector::get(CV);
+ CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
+ SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
+ PseudoSourceValue::getConstantPool(), 0,
+ false, false, 16);
+ SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2);
+
+ // Or the value with the sign bit.
+ return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
+}
+
+/// Emit nodes that will be selected as "test Op0,Op0", or something
+/// equivalent.
+SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
+ SelectionDAG &DAG) const {
+ DebugLoc dl = Op.getDebugLoc();
+
+ // CF and OF aren't always set the way we want. Determine which
+ // of these we need.
+ bool NeedCF = false;
+ bool NeedOF = false;
+ switch (X86CC) {
+ case X86::COND_A: case X86::COND_AE:
+ case X86::COND_B: case X86::COND_BE:
+ NeedCF = true;
+ break;
+ case X86::COND_G: case X86::COND_GE:
+ case X86::COND_L: case X86::COND_LE:
+ case X86::COND_O: case X86::COND_NO:
+ NeedOF = true;
+ break;
+ default: break;
+ }
+
+ // See if we can use the EFLAGS value from the operand instead of
+ // doing a separate TEST. TEST always sets OF and CF to 0, so unless
+ // we prove that the arithmetic won't overflow, we can't use OF or CF.
+ if (Op.getResNo() == 0 && !NeedOF && !NeedCF) {
+ unsigned Opcode = 0;
+ unsigned NumOperands = 0;
+ switch (Op.getNode()->getOpcode()) {
+ case ISD::ADD:
+ // Due to an isel shortcoming, be conservative if this add is
+ // likely to be selected as part of a load-modify-store
+ // instruction. When the root node in a match is a store, isel
+ // doesn't know how to remap non-chain non-flag uses of other
+ // nodes in the match, such as the ADD in this case. This leads
+ // to the ADD being left around and reselected, with the result
+ // being two adds in the output. Alas, even if none of our users
+ // are stores, that doesn't prove we're O.K. Ergo, if we have
+ // any parents that aren't CopyToReg or SETCC, eschew INC/DEC.
+ // A better fix seems to require climbing the DAG back to the
+ // root, and it doesn't seem to be worth the effort.
+ for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+ UE = Op.getNode()->use_end(); UI != UE; ++UI)
+ if (UI->getOpcode() != ISD::CopyToReg && UI->getOpcode() != ISD::SETCC)
+ goto default_case;
+ if (ConstantSDNode *C =
+ dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) {
+ // An add of one will be selected as an INC.
+ if (C->getAPIntValue() == 1) {
+ Opcode = X86ISD::INC;
+ NumOperands = 1;
+ break;
+ }
+ // An add of negative one (subtract of one) will be selected as a DEC.
+ if (C->getAPIntValue().isAllOnesValue()) {
+ Opcode = X86ISD::DEC;
+ NumOperands = 1;
+ break;
+ }
+ }
+ // Otherwise use a regular EFLAGS-setting add.
+ Opcode = X86ISD::ADD;
+ NumOperands = 2;
+ break;
+ case ISD::AND: {
+ // If the result of the primary 'and' isn't used, don't bother using
+ // X86ISD::AND, because a TEST instruction will be better.
+ bool NonFlagUse = false;
+ for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+ UE = Op.getNode()->use_end(); UI != UE; ++UI) {
+ SDNode *User = *UI;
+ unsigned UOpNo = UI.getOperandNo();
+ if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
+ // Look past the truncate.
+ UOpNo = User->use_begin().getOperandNo();
+ User = *User->use_begin();
+ }
+ if (User->getOpcode() != ISD::BRCOND &&
+ User->getOpcode() != ISD::SETCC &&
+ (User->getOpcode() != ISD::SELECT || UOpNo != 0)) {
+ NonFlagUse = true;
+ break;
+ }
+ }
+ if (!NonFlagUse)
+ break;
+ }
+ // FALL THROUGH
+ case ISD::SUB:
+ case ISD::OR:
+ case ISD::XOR:
+ // Due to the ISEL shortcoming noted above, be conservative if this op is
+ // likely to be selected as part of a load-modify-store instruction.
+ for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+ UE = Op.getNode()->use_end(); UI != UE; ++UI)
+ if (UI->getOpcode() == ISD::STORE)
+ goto default_case;
+ // Otherwise use a regular EFLAGS-setting instruction.
+ switch (Op.getNode()->getOpcode()) {
+ case ISD::SUB: Opcode = X86ISD::SUB; break;
+ case ISD::OR: Opcode = X86ISD::OR; break;
+ case ISD::XOR: Opcode = X86ISD::XOR; break;
+ case ISD::AND: Opcode = X86ISD::AND; break;
+ default: llvm_unreachable("unexpected operator!");
+ }
+ NumOperands = 2;
+ break;
+ case X86ISD::ADD:
+ case X86ISD::SUB:
+ case X86ISD::INC:
+ case X86ISD::DEC:
+ case X86ISD::OR:
+ case X86ISD::XOR:
+ case X86ISD::AND:
+ return SDValue(Op.getNode(), 1);
+ default:
+ default_case:
+ break;
+ }
+ if (Opcode != 0) {
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+ SmallVector<SDValue, 4> Ops;
+ for (unsigned i = 0; i != NumOperands; ++i)
+ Ops.push_back(Op.getOperand(i));
+ SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands);
+ DAG.ReplaceAllUsesWith(Op, New);
+ return SDValue(New.getNode(), 1);
+ }
+ }
+
+ // Otherwise just emit a CMP with 0, which is the TEST pattern.
+ return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
+ DAG.getConstant(0, Op.getValueType()));
+}
+
+/// Emit nodes that will be selected as "cmp Op0,Op1", or something
+/// equivalent.
+SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
+ SelectionDAG &DAG) const {
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1))
+ if (C->getAPIntValue() == 0)
+ return EmitTest(Op0, X86CC, DAG);
+
+ DebugLoc dl = Op0.getDebugLoc();
+ return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
+}
+
+/// LowerToBT - The result of an 'and' is compared against zero. Turn it into
+/// a BT node if possible.
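+/// For example, ((X >> N) & 1) != 0 becomes (setcc COND_B (X86ISD::BT X, N)),
+/// and (X & (1 << N)) == 0 becomes (setcc COND_AE (X86ISD::BT X, N)).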
+SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
+ DebugLoc dl, SelectionDAG &DAG) const {
+ SDValue Op0 = And.getOperand(0);
+ SDValue Op1 = And.getOperand(1);
+ if (Op0.getOpcode() == ISD::TRUNCATE)
+ Op0 = Op0.getOperand(0);
+ if (Op1.getOpcode() == ISD::TRUNCATE)
+ Op1 = Op1.getOperand(0);
+
+ SDValue LHS, RHS;
+ if (Op1.getOpcode() == ISD::SHL) {
+ if (ConstantSDNode *And10C = dyn_cast<ConstantSDNode>(Op1.getOperand(0)))
+ if (And10C->getZExtValue() == 1) {
+ LHS = Op0;
+ RHS = Op1.getOperand(1);
+ }
+ } else if (Op0.getOpcode() == ISD::SHL) {
+ if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0)))
+ if (And00C->getZExtValue() == 1) {
+ LHS = Op1;
+ RHS = Op0.getOperand(1);
+ }
+ } else if (Op1.getOpcode() == ISD::Constant) {
+ ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
+ SDValue AndLHS = Op0;
+ if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) {
+ LHS = AndLHS.getOperand(0);
+ RHS = AndLHS.getOperand(1);
+ }
+ }
+
+ if (LHS.getNode()) {
+ // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT
+ // instruction. Since the shift amount is in-range-or-undefined, we know
+ // that doing a bittest on the i32 value is ok. We extend to i32 because
+ // the encoding for the i16 version is larger than the i32 version.
+ // Also promote i16 to i32 for performance / code size reasons.
+ if (LHS.getValueType() == MVT::i8 ||
+ LHS.getValueType() == MVT::i16)
+ LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
+
+ // If the operand types disagree, extend the shift amount to match. Since
+ // BT ignores high bits (like shifts) we can use anyextend.
+ if (LHS.getValueType() != RHS.getValueType())
+ RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
+
+ SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
+ unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
+ return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(Cond, MVT::i8), BT);
+ }
+
+ return SDValue();
+}
+
+SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
+ assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer");
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ DebugLoc dl = Op.getDebugLoc();
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+
+ // Optimize to BT if possible.
+ // Lower (X & (1 << N)) == 0 to BT(X, N).
+ // Lower ((X >>u N) & 1) != 0 to BT(X, N).
+ // Lower ((X >>s N) & 1) != 0 to BT(X, N).
+ if (Op0.getOpcode() == ISD::AND &&
+ Op0.hasOneUse() &&
+ Op1.getOpcode() == ISD::Constant &&
+ cast<ConstantSDNode>(Op1)->getZExtValue() == 0 &&
+ (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+ SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
+ if (NewSetCC.getNode())
+ return NewSetCC;
+ }
+
+ // Look for "(setcc) == / != 1" to avoid unncessary setcc.
+ if (Op0.getOpcode() == X86ISD::SETCC &&
+ Op1.getOpcode() == ISD::Constant &&
+ (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
+ cast<ConstantSDNode>(Op1)->isNullValue()) &&
+ (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+ X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
+ bool Invert = (CC == ISD::SETNE) ^
+ cast<ConstantSDNode>(Op1)->isNullValue();
+ if (Invert)
+ CCode = X86::GetOppositeBranchCondition(CCode);
+ return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1));
+ }
+
+ bool isFP = Op1.getValueType().isFloatingPoint();
+ unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
+ if (X86CC == X86::COND_INVALID)
+ return SDValue();
+
+ SDValue Cond = EmitCmp(Op0, Op1, X86CC, DAG);
+
+ // Use sbb x, x to materialize carry bit into a GPR.
+ if (X86CC == X86::COND_B)
+ return DAG.getNode(ISD::AND, dl, MVT::i8,
+ DAG.getNode(X86ISD::SETCC_CARRY, dl, MVT::i8,
+ DAG.getConstant(X86CC, MVT::i8), Cond),
+ DAG.getConstant(1, MVT::i8));
+
+ return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(X86CC, MVT::i8), Cond);
+}
+
+SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
+ SDValue Cond;
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ SDValue CC = Op.getOperand(2);
+ EVT VT = Op.getValueType();
+ ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
+ bool isFP = Op.getOperand(1).getValueType().isFloatingPoint();
+ DebugLoc dl = Op.getDebugLoc();
+
+ if (isFP) {
+ unsigned SSECC = 8;
+ EVT VT0 = Op0.getValueType();
+ assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64);
+ unsigned Opc = VT0 == MVT::v4f32 ? X86ISD::CMPPS : X86ISD::CMPPD;
+ bool Swap = false;
+
+ switch (SetCCOpcode) {
+ default: break;
+ case ISD::SETOEQ:
+ case ISD::SETEQ: SSECC = 0; break;
+ case ISD::SETOGT:
+ case ISD::SETGT: Swap = true; // Fallthrough
+ case ISD::SETLT:
+ case ISD::SETOLT: SSECC = 1; break;
+ case ISD::SETOGE:
+ case ISD::SETGE: Swap = true; // Fallthrough
+ case ISD::SETLE:
+ case ISD::SETOLE: SSECC = 2; break;
+ case ISD::SETUO: SSECC = 3; break;
+ case ISD::SETUNE:
+ case ISD::SETNE: SSECC = 4; break;
+ case ISD::SETULE: Swap = true;
+ case ISD::SETUGE: SSECC = 5; break;
+ case ISD::SETULT: Swap = true;
+ case ISD::SETUGT: SSECC = 6; break;
+ case ISD::SETO: SSECC = 7; break;
+ }
+ if (Swap)
+ std::swap(Op0, Op1);
+
+ // In the two special cases we can't handle, emit two comparisons.
+ if (SSECC == 8) {
+ if (SetCCOpcode == ISD::SETUEQ) {
+ SDValue UNORD, EQ;
+ UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8));
+ EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8));
+ return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ);
+ }
+ else if (SetCCOpcode == ISD::SETONE) {
+ SDValue ORD, NEQ;
+ ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8));
+ NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8));
+ return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ);
+ }
+ llvm_unreachable("Illegal FP comparison");
+ }
+ // Handle all other FP comparisons here.
+ return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8));
+ }
+
+ // We are handling one of the integer comparisons here. Since SSE only has
+ // GT and EQ comparisons for integer, swapping operands and multiple
+ // operations may be required for some comparisons.
+ unsigned Opc = 0, EQOpc = 0, GTOpc = 0;
+ bool Swap = false, Invert = false, FlipSigns = false;
+
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: break;
+ case MVT::v8i8:
+ case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break;
+ case MVT::v4i16:
+ case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break;
+ case MVT::v2i32:
+ case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break;
+ case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break;
+ }
+
+ switch (SetCCOpcode) {
+ default: break;
+ case ISD::SETNE: Invert = true;
+ case ISD::SETEQ: Opc = EQOpc; break;
+ case ISD::SETLT: Swap = true;
+ case ISD::SETGT: Opc = GTOpc; break;
+ case ISD::SETGE: Swap = true;
+ case ISD::SETLE: Opc = GTOpc; Invert = true; break;
+ case ISD::SETULT: Swap = true;
+ case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break;
+ case ISD::SETUGE: Swap = true;
+ case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break;
+ }
+ if (Swap)
+ std::swap(Op0, Op1);
+
+ // Since SSE has no unsigned integer comparisons, we need to flip the sign
+ // bits of the inputs before performing those operations.
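+ // This relies on the identity (x <u y) == ((x ^ SignBit) <s (y ^ SignBit)),
+ // so XOR-ing both operands with the sign bit lets PCMPGT do the unsigned
+ // comparison.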
+ if (FlipSigns) {
+ EVT EltVT = VT.getVectorElementType();
+ SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()),
+ EltVT);
+ std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit);
+ SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0],
+ SignBits.size());
+ Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec);
+ Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec);
+ }
+
+ SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
+
+ // If the logical-not of the result is required, perform that now.
+ if (Invert)
+ Result = DAG.getNOT(dl, Result, VT);
+
+ return Result;
+}
+
+// isX86LogicalCmp - Return true if opcode is a X86 logical comparison.
+static bool isX86LogicalCmp(SDValue Op) {
+ unsigned Opc = Op.getNode()->getOpcode();
+ if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI)
+ return true;
+ if (Op.getResNo() == 1 &&
+ (Opc == X86ISD::ADD ||
+ Opc == X86ISD::SUB ||
+ Opc == X86ISD::SMUL ||
+ Opc == X86ISD::UMUL ||
+ Opc == X86ISD::INC ||
+ Opc == X86ISD::DEC ||
+ Opc == X86ISD::OR ||
+ Opc == X86ISD::XOR ||
+ Opc == X86ISD::AND))
+ return true;
+
+ return false;
+}
+
+SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
+ bool addTest = true;
+ SDValue Cond = Op.getOperand(0);
+ DebugLoc dl = Op.getDebugLoc();
+ SDValue CC;
+
+ if (Cond.getOpcode() == ISD::SETCC) {
+ SDValue NewCond = LowerSETCC(Cond, DAG);
+ if (NewCond.getNode())
+ Cond = NewCond;
+ }
+
+ // (select (x == 0), -1, 0) -> (sign_bit (x - 1))
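+ // That is: CMP x, 1 sets CF exactly when x == 0, and SETCC_CARRY(COND_B)
+ // (an SBB reg,reg) broadcasts that carry into every bit, producing -1 or 0.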
+ SDValue Op1 = Op.getOperand(1);
+ SDValue Op2 = Op.getOperand(2);
+ if (Cond.getOpcode() == X86ISD::SETCC &&
+ cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue() == X86::COND_E) {
+ SDValue Cmp = Cond.getOperand(1);
+ if (Cmp.getOpcode() == X86ISD::CMP) {
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op1);
+ ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
+ ConstantSDNode *RHSC =
+ dyn_cast<ConstantSDNode>(Cmp.getOperand(1).getNode());
+ if (N1C && N1C->isAllOnesValue() &&
+ N2C && N2C->isNullValue() &&
+ RHSC && RHSC->isNullValue()) {
+ SDValue CmpOp0 = Cmp.getOperand(0);
+ Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
+ CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));
+ return DAG.getNode(X86ISD::SETCC_CARRY, dl, Op.getValueType(),
+ DAG.getConstant(X86::COND_B, MVT::i8), Cmp);
+ }
+ }
+ }
+
+ // Look past (and (setcc_carry (cmp ...)), 1).
+ if (Cond.getOpcode() == ISD::AND &&
+ Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
+ if (C && C->getAPIntValue() == 1)
+ Cond = Cond.getOperand(0);
+ }
+
+ // If condition flag is set by a X86ISD::CMP, then use it as the condition
+ // setting operand in place of the X86ISD::SETCC.
+ if (Cond.getOpcode() == X86ISD::SETCC ||
+ Cond.getOpcode() == X86ISD::SETCC_CARRY) {
+ CC = Cond.getOperand(0);
+
+ SDValue Cmp = Cond.getOperand(1);
+ unsigned Opc = Cmp.getOpcode();
+ EVT VT = Op.getValueType();
+
+ bool IllegalFPCMov = false;
+ if (VT.isFloatingPoint() && !VT.isVector() &&
+ !isScalarFPTypeInSSEReg(VT)) // FPStack?
+ IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
+
+ if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
+ Opc == X86ISD::BT) { // FIXME
+ Cond = Cmp;
+ addTest = false;
+ }
+ }
+
+ if (addTest) {
+ // Look past the truncate.
+ if (Cond.getOpcode() == ISD::TRUNCATE)
+ Cond = Cond.getOperand(0);
+
+ // We know the result of AND is compared against zero. Try to match
+ // it to BT.
+ if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
+ SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
+ if (NewSetCC.getNode()) {
+ CC = NewSetCC.getOperand(0);
+ Cond = NewSetCC.getOperand(1);
+ addTest = false;
+ }
+ }
+ }
+
+ if (addTest) {
+ CC = DAG.getConstant(X86::COND_NE, MVT::i8);
+ Cond = EmitTest(Cond, X86::COND_NE, DAG);
+ }
+
+ // X86ISD::CMOV selects its second value operand when the condition holds and
+ // its first otherwise, so passing (Op2, Op1) implements select(Cond, Op1, Op2).
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Flag);
+ SDValue Ops[] = { Op2, Op1, CC, Cond };
+ return DAG.getNode(X86ISD::CMOV, dl, VTs, Ops, array_lengthof(Ops));
+}
+
+// isAndOrOfSetCCs - Return true if node is an ISD::AND or ISD::OR of two
+// X86ISD::SETCC nodes, each of which has no other use apart from the AND/OR.
+static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
+ Opc = Op.getOpcode();
+ if (Opc != ISD::OR && Opc != ISD::AND)
+ return false;
+ return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
+ Op.getOperand(0).hasOneUse() &&
+ Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
+ Op.getOperand(1).hasOneUse());
+}
+
+// isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and
+// 1 and that the SETCC node has a single use.
+static bool isXor1OfSetCC(SDValue Op) {
+ if (Op.getOpcode() != ISD::XOR)
+ return false;
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+ if (N1C && N1C->getAPIntValue() == 1) {
+ return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
+ Op.getOperand(0).hasOneUse();
+ }
+ return false;
+}
+
+SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
+ bool addTest = true;
+ SDValue Chain = Op.getOperand(0);
+ SDValue Cond = Op.getOperand(1);
+ SDValue Dest = Op.getOperand(2);
+ DebugLoc dl = Op.getDebugLoc();
+ SDValue CC;
+
+ if (Cond.getOpcode() == ISD::SETCC) {
+ SDValue NewCond = LowerSETCC(Cond, DAG);
+ if (NewCond.getNode())
+ Cond = NewCond;
+ }
+#if 0
+ // FIXME: LowerXALUO doesn't handle these!!
+ else if (Cond.getOpcode() == X86ISD::ADD ||
+ Cond.getOpcode() == X86ISD::SUB ||
+ Cond.getOpcode() == X86ISD::SMUL ||
+ Cond.getOpcode() == X86ISD::UMUL)
+ Cond = LowerXALUO(Cond, DAG);
+#endif
+
+ // Look past (and (setcc_carry (cmp ...)), 1).
+ if (Cond.getOpcode() == ISD::AND &&
+ Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
+ if (C && C->getAPIntValue() == 1)
+ Cond = Cond.getOperand(0);
+ }
+
+ // If condition flag is set by a X86ISD::CMP, then use it as the condition
+ // setting operand in place of the X86ISD::SETCC.
+ if (Cond.getOpcode() == X86ISD::SETCC ||
+ Cond.getOpcode() == X86ISD::SETCC_CARRY) {
+ CC = Cond.getOperand(0);
+
+ SDValue Cmp = Cond.getOperand(1);
+ unsigned Opc = Cmp.getOpcode();
+ // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
+ if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
+ Cond = Cmp;
+ addTest = false;
+ } else {
+ switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
+ default: break;
+ case X86::COND_O:
+ case X86::COND_B:
+ // These can only come from an arithmetic instruction with overflow,
+ // e.g. SADDO, UADDO.
+ Cond = Cond.getNode()->getOperand(1);
+ addTest = false;
+ break;
+ }
+ }
+ } else {
+ unsigned CondOpc;
+ if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
+ SDValue Cmp = Cond.getOperand(0).getOperand(1);
+ if (CondOpc == ISD::OR) {
+ // Also, recognize the pattern generated by an FCMP_UNE. We can emit
+ // two branches instead of an explicit OR instruction with a
+ // separate test.
+ if (Cmp == Cond.getOperand(1).getOperand(1) &&
+ isX86LogicalCmp(Cmp)) {
+ CC = Cond.getOperand(0).getOperand(0);
+ Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
+ Chain, Dest, CC, Cmp);
+ CC = Cond.getOperand(1).getOperand(0);
+ Cond = Cmp;
+ addTest = false;
+ }
+ } else { // ISD::AND
+ // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
+ // two branches instead of an explicit AND instruction with a
+ // separate test. However, we only do this if this block doesn't
+ // have a fall-through edge, because this requires an explicit
+ // jmp when the condition is false.
+ if (Cmp == Cond.getOperand(1).getOperand(1) &&
+ isX86LogicalCmp(Cmp) &&
+ Op.getNode()->hasOneUse()) {
+ X86::CondCode CCode =
+ (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
+ CCode = X86::GetOppositeBranchCondition(CCode);
+ CC = DAG.getConstant(CCode, MVT::i8);
+ SDValue User = SDValue(*Op.getNode()->use_begin(), 0);
+ // Look for an unconditional branch following this conditional branch.
+ // We need this because we need to reverse the successors in order
+ // to implement FCMP_OEQ.
+ if (User.getOpcode() == ISD::BR) {
+ SDValue FalseBB = User.getOperand(1);
+ SDValue NewBR =
+ DAG.UpdateNodeOperands(User, User.getOperand(0), Dest);
+ assert(NewBR == User);
+ Dest = FalseBB;
+
+ Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
+ Chain, Dest, CC, Cmp);
+ X86::CondCode CCode =
+ (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
+ CCode = X86::GetOppositeBranchCondition(CCode);
+ CC = DAG.getConstant(CCode, MVT::i8);
+ Cond = Cmp;
+ addTest = false;
+ }
+ }
+ }
+ } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
+ // Recognize the xorb (setcc), 1 pattern. The xor inverts the condition.
+ // It should be transformed by the DAG combiner except when the condition
+ // is set by an arithmetic-with-overflow node.
+ X86::CondCode CCode =
+ (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
+ CCode = X86::GetOppositeBranchCondition(CCode);
+ CC = DAG.getConstant(CCode, MVT::i8);
+ Cond = Cond.getOperand(0).getOperand(1);
+ addTest = false;
+ }
+ }
+
+ if (addTest) {
+ // Look past the truncate.
+ if (Cond.getOpcode() == ISD::TRUNCATE)
+ Cond = Cond.getOperand(0);
+
+ // We know the result of AND is compared against zero. Try to match
+ // it to BT.
+ if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
+ SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
+ if (NewSetCC.getNode()) {
+ CC = NewSetCC.getOperand(0);
+ Cond = NewSetCC.getOperand(1);
+ addTest = false;
+ }
+ }
+ }
+
+ if (addTest) {
+ CC = DAG.getConstant(X86::COND_NE, MVT::i8);
+ Cond = EmitTest(Cond, X86::COND_NE, DAG);
+ }
+ return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
+ Chain, Dest, CC, Cond);
+}
+
+
+// Lower dynamic stack allocation to an _alloca call for Cygwin/MinGW targets.
+// Calls to _alloca are needed to probe the stack when allocating more than 4k
+// bytes in one go. Touching the stack at 4K increments is necessary to ensure
+// that the guard pages used by the OS virtual memory manager are allocated in
+// the correct sequence.
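+// The allocation size is passed to the probe routine in EAX; the routine is
+// expected to adjust the stack pointer itself (there is no explicit SUB here),
+// and the resulting stack pointer is read back below as the node's value.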
+SDValue
+X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Subtarget->isTargetCygMing() &&
+ "This should be used only on Cygwin/Mingw targets");
+ DebugLoc dl = Op.getDebugLoc();
+
+ // Get the inputs.
+ SDValue Chain = Op.getOperand(0);
+ SDValue Size = Op.getOperand(1);
+ // FIXME: Ensure alignment here
+
+ SDValue Flag;
+
+ EVT IntPtr = getPointerTy();
+ EVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32;
+
+ Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Size, Flag);
+ Flag = Chain.getValue(1);
+
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+
+ Chain = DAG.getNode(X86ISD::MINGW_ALLOCA, dl, NodeTys, Chain, Flag);
+ Flag = Chain.getValue(1);
+
+ Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1);
+
+ SDValue Ops1[2] = { Chain.getValue(0), Chain };
+ return DAG.getMergeValues(Ops1, 2, dl);
+}
+
+SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+
+ const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+ DebugLoc dl = Op.getDebugLoc();
+
+ if (!Subtarget->is64Bit()) {
+ // vastart just stores the address of the VarArgsFrameIndex slot into the
+ // memory location argument.
+ SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
+ getPointerTy());
+ return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0,
+ false, false, 0);
+ }
+
+ // __va_list_tag:
+ // gp_offset (0 - 6 * 8)
+ // fp_offset (48 - 48 + 8 * 16)
+ // overflow_arg_area (points to parameters passed in memory).
+ // reg_save_area
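+ // In C terms, the stores below fill in (byte offsets in parentheses):
+ // struct __va_list_tag {
+ // unsigned gp_offset; // (0)
+ // unsigned fp_offset; // (4)
+ // void *overflow_arg_area; // (8)
+ // void *reg_save_area; // (16)
+ // };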
+ SmallVector<SDValue, 8> MemOps;
+ SDValue FIN = Op.getOperand(1);
+ // Store gp_offset
+ SDValue Store = DAG.getStore(Op.getOperand(0), dl,
+ DAG.getConstant(FuncInfo->getVarArgsGPOffset(),
+ MVT::i32),
+ FIN, SV, 0, false, false, 0);
+ MemOps.push_back(Store);
+
+ // Store fp_offset
+ FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(),
+ FIN, DAG.getIntPtrConstant(4));
+ Store = DAG.getStore(Op.getOperand(0), dl,
+ DAG.getConstant(FuncInfo->getVarArgsFPOffset(),
+ MVT::i32),
+ FIN, SV, 0, false, false, 0);
+ MemOps.push_back(Store);
+
+ // Store ptr to overflow_arg_area
+ FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(),
+ FIN, DAG.getIntPtrConstant(4));
+ SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
+ getPointerTy());
+ Store = DAG.getStore(Op.getOperand(0), dl, OVFIN, FIN, SV, 0,
+ false, false, 0);
+ MemOps.push_back(Store);
+
+ // Store ptr to reg_save_area.
+ FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(),
+ FIN, DAG.getIntPtrConstant(8));
+ SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
+ getPointerTy());
+ Store = DAG.getStore(Op.getOperand(0), dl, RSFIN, FIN, SV, 0,
+ false, false, 0);
+ MemOps.push_back(Store);
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &MemOps[0], MemOps.size());
+}
+
+SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
+ // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
+ assert(Subtarget->is64Bit() && "This code only handles 64-bit va_arg!");
+ SDValue Chain = Op.getOperand(0);
+ SDValue SrcPtr = Op.getOperand(1);
+ SDValue SrcSV = Op.getOperand(2);
+
+ report_fatal_error("VAArgInst is not yet implemented for x86-64!");
+ return SDValue();
+}
+
+SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
+ // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
+ assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
+ SDValue Chain = Op.getOperand(0);
+ SDValue DstPtr = Op.getOperand(1);
+ SDValue SrcPtr = Op.getOperand(2);
+ const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
+ const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
+ DebugLoc dl = Op.getDebugLoc();
+
+ return DAG.getMemcpy(Chain, dl, DstPtr, SrcPtr,
+ DAG.getIntPtrConstant(24), 8, /*isVolatile*/false,
+ false, DstSV, 0, SrcSV, 0);
+}
+
+SDValue
+X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const {
+ DebugLoc dl = Op.getDebugLoc();
+ unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ switch (IntNo) {
+ default: return SDValue(); // Don't custom lower most intrinsics.
+ // Comparison intrinsics.
+ case Intrinsic::x86_sse_comieq_ss:
+ case Intrinsic::x86_sse_comilt_ss:
+ case Intrinsic::x86_sse_comile_ss:
+ case Intrinsic::x86_sse_comigt_ss:
+ case Intrinsic::x86_sse_comige_ss:
+ case Intrinsic::x86_sse_comineq_ss:
+ case Intrinsic::x86_sse_ucomieq_ss:
+ case Intrinsic::x86_sse_ucomilt_ss:
+ case Intrinsic::x86_sse_ucomile_ss:
+ case Intrinsic::x86_sse_ucomigt_ss:
+ case Intrinsic::x86_sse_ucomige_ss:
+ case Intrinsic::x86_sse_ucomineq_ss:
+ case Intrinsic::x86_sse2_comieq_sd:
+ case Intrinsic::x86_sse2_comilt_sd:
+ case Intrinsic::x86_sse2_comile_sd:
+ case Intrinsic::x86_sse2_comigt_sd:
+ case Intrinsic::x86_sse2_comige_sd:
+ case Intrinsic::x86_sse2_comineq_sd:
+ case Intrinsic::x86_sse2_ucomieq_sd:
+ case Intrinsic::x86_sse2_ucomilt_sd:
+ case Intrinsic::x86_sse2_ucomile_sd:
+ case Intrinsic::x86_sse2_ucomigt_sd:
+ case Intrinsic::x86_sse2_ucomige_sd:
+ case Intrinsic::x86_sse2_ucomineq_sd: {
+ unsigned Opc = 0;
+ ISD::CondCode CC = ISD::SETCC_INVALID;
+ switch (IntNo) {
+ default: break;
+ case Intrinsic::x86_sse_comieq_ss:
+ case Intrinsic::x86_sse2_comieq_sd:
+ Opc = X86ISD::COMI;
+ CC = ISD::SETEQ;
+ break;
+ case Intrinsic::x86_sse_comilt_ss:
+ case Intrinsic::x86_sse2_comilt_sd:
+ Opc = X86ISD::COMI;
+ CC = ISD::SETLT;
+ break;
+ case Intrinsic::x86_sse_comile_ss:
+ case Intrinsic::x86_sse2_comile_sd:
+ Opc = X86ISD::COMI;
+ CC = ISD::SETLE;
+ break;
+ case Intrinsic::x86_sse_comigt_ss:
+ case Intrinsic::x86_sse2_comigt_sd:
+ Opc = X86ISD::COMI;
+ CC = ISD::SETGT;
+ break;
+ case Intrinsic::x86_sse_comige_ss:
+ case Intrinsic::x86_sse2_comige_sd:
+ Opc = X86ISD::COMI;
+ CC = ISD::SETGE;
+ break;
+ case Intrinsic::x86_sse_comineq_ss:
+ case Intrinsic::x86_sse2_comineq_sd:
+ Opc = X86ISD::COMI;
+ CC = ISD::SETNE;
+ break;
+ case Intrinsic::x86_sse_ucomieq_ss:
+ case Intrinsic::x86_sse2_ucomieq_sd:
+ Opc = X86ISD::UCOMI;
+ CC = ISD::SETEQ;
+ break;
+ case Intrinsic::x86_sse_ucomilt_ss:
+ case Intrinsic::x86_sse2_ucomilt_sd:
+ Opc = X86ISD::UCOMI;
+ CC = ISD::SETLT;
+ break;
+ case Intrinsic::x86_sse_ucomile_ss:
+ case Intrinsic::x86_sse2_ucomile_sd:
+ Opc = X86ISD::UCOMI;
+ CC = ISD::SETLE;
+ break;
+ case Intrinsic::x86_sse_ucomigt_ss:
+ case Intrinsic::x86_sse2_ucomigt_sd:
+ Opc = X86ISD::UCOMI;
+ CC = ISD::SETGT;
+ break;
+ case Intrinsic::x86_sse_ucomige_ss:
+ case Intrinsic::x86_sse2_ucomige_sd:
+ Opc = X86ISD::UCOMI;
+ CC = ISD::SETGE;
+ break;
+ case Intrinsic::x86_sse_ucomineq_ss:
+ case Intrinsic::x86_sse2_ucomineq_sd:
+ Opc = X86ISD::UCOMI;
+ CC = ISD::SETNE;
+ break;
+ }
+
+ SDValue LHS = Op.getOperand(1);
+ SDValue RHS = Op.getOperand(2);
+ unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
+ assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
+ SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS);
+ SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(X86CC, MVT::i8), Cond);
+ return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
+ }
+ // ptest intrinsics. The intrinsics these come from are designed to return
+ // an integer value, not just an instruction, so lower them to the ptest
+ // pattern plus a setcc on the result.
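+ // For reference, ptest sets ZF when (LHS & RHS) == 0 and CF when
+ // (~LHS & RHS) == 0, so ptestz maps to sete, ptestc to setb, and
+ // ptestnzc (neither flag set) to seta, as selected below.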
+ case Intrinsic::x86_sse41_ptestz:
+ case Intrinsic::x86_sse41_ptestc:
+ case Intrinsic::x86_sse41_ptestnzc:{
+ unsigned X86CC = 0;
+ switch (IntNo) {
+ default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
+ case Intrinsic::x86_sse41_ptestz:
+ // ZF = 1
+ X86CC = X86::COND_E;
+ break;
+ case Intrinsic::x86_sse41_ptestc:
+ // CF = 1
+ X86CC = X86::COND_B;
+ break;
+ case Intrinsic::x86_sse41_ptestnzc:
+ // ZF and CF = 0
+ X86CC = X86::COND_A;
+ break;
+ }
+
+ SDValue LHS = Op.getOperand(1);
+ SDValue RHS = Op.getOperand(2);
+ SDValue Test = DAG.getNode(X86ISD::PTEST, dl, MVT::i32, LHS, RHS);
+ SDValue CC = DAG.getConstant(X86CC, MVT::i8);
+ SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
+ return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
+ }
+
+ // Fix vector shift instructions where the last operand is a non-immediate
+ // i32 value.
+ case Intrinsic::x86_sse2_pslli_w:
+ case Intrinsic::x86_sse2_pslli_d:
+ case Intrinsic::x86_sse2_pslli_q:
+ case Intrinsic::x86_sse2_psrli_w:
+ case Intrinsic::x86_sse2_psrli_d:
+ case Intrinsic::x86_sse2_psrli_q:
+ case Intrinsic::x86_sse2_psrai_w:
+ case Intrinsic::x86_sse2_psrai_d:
+ case Intrinsic::x86_mmx_pslli_w:
+ case Intrinsic::x86_mmx_pslli_d:
+ case Intrinsic::x86_mmx_pslli_q:
+ case Intrinsic::x86_mmx_psrli_w:
+ case Intrinsic::x86_mmx_psrli_d:
+ case Intrinsic::x86_mmx_psrli_q:
+ case Intrinsic::x86_mmx_psrai_w:
+ case Intrinsic::x86_mmx_psrai_d: {
+ SDValue ShAmt = Op.getOperand(2);
+ if (isa<ConstantSDNode>(ShAmt))
+ return SDValue();
+
+ unsigned NewIntNo = 0;
+ EVT ShAmtVT = MVT::v4i32;
+ switch (IntNo) {
+ case Intrinsic::x86_sse2_pslli_w:
+ NewIntNo = Intrinsic::x86_sse2_psll_w;
+ break;
+ case Intrinsic::x86_sse2_pslli_d:
+ NewIntNo = Intrinsic::x86_sse2_psll_d;
+ break;
+ case Intrinsic::x86_sse2_pslli_q:
+ NewIntNo = Intrinsic::x86_sse2_psll_q;
+ break;
+ case Intrinsic::x86_sse2_psrli_w:
+ NewIntNo = Intrinsic::x86_sse2_psrl_w;
+ break;
+ case Intrinsic::x86_sse2_psrli_d:
+ NewIntNo = Intrinsic::x86_sse2_psrl_d;
+ break;
+ case Intrinsic::x86_sse2_psrli_q:
+ NewIntNo = Intrinsic::x86_sse2_psrl_q;
+ break;
+ case Intrinsic::x86_sse2_psrai_w:
+ NewIntNo = Intrinsic::x86_sse2_psra_w;
+ break;
+ case Intrinsic::x86_sse2_psrai_d:
+ NewIntNo = Intrinsic::x86_sse2_psra_d;
+ break;
+ default: {
+ ShAmtVT = MVT::v2i32;
+ switch (IntNo) {
+ case Intrinsic::x86_mmx_pslli_w:
+ NewIntNo = Intrinsic::x86_mmx_psll_w;
+ break;
+ case Intrinsic::x86_mmx_pslli_d:
+ NewIntNo = Intrinsic::x86_mmx_psll_d;
+ break;
+ case Intrinsic::x86_mmx_pslli_q:
+ NewIntNo = Intrinsic::x86_mmx_psll_q;
+ break;
+ case Intrinsic::x86_mmx_psrli_w:
+ NewIntNo = Intrinsic::x86_mmx_psrl_w;
+ break;
+ case Intrinsic::x86_mmx_psrli_d:
+ NewIntNo = Intrinsic::x86_mmx_psrl_d;
+ break;
+ case Intrinsic::x86_mmx_psrli_q:
+ NewIntNo = Intrinsic::x86_mmx_psrl_q;
+ break;
+ case Intrinsic::x86_mmx_psrai_w:
+ NewIntNo = Intrinsic::x86_mmx_psra_w;
+ break;
+ case Intrinsic::x86_mmx_psrai_d:
+ NewIntNo = Intrinsic::x86_mmx_psra_d;
+ break;
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ }
+ break;
+ }
+ }
+
+ // The vector shift intrinsics with scalar counts use 32-bit shift amounts,
+ // but the SSE2/MMX shift instructions read 64 bits. Set the upper 32 bits
+ // to zero.
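+ // The scalar count is therefore placed in lane 0 with an explicit zero in
+ // lane 1 (remaining lanes, if any, are undef) before the vector is
+ // bit-converted to the operand type.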
+ SDValue ShOps[4];
+ ShOps[0] = ShAmt;
+ ShOps[1] = DAG.getConstant(0, MVT::i32);
+ if (ShAmtVT == MVT::v4i32) {
+ ShOps[2] = DAG.getUNDEF(MVT::i32);
+ ShOps[3] = DAG.getUNDEF(MVT::i32);
+ ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 4);
+ } else {
+ ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2);
+ }
+
+ EVT VT = Op.getValueType();
+ ShAmt = DAG.getNode(ISD::BIT_CONVERT, dl, VT, ShAmt);
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(NewIntNo, MVT::i32),
+ Op.getOperand(1), ShAmt);
+ }
+ }
+}
+
+SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
+ SelectionDAG &DAG) const {
+ MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+ MFI->setReturnAddressIsTaken(true);
+
+ unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ DebugLoc dl = Op.getDebugLoc();
+
+ if (Depth > 0) {
+ SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
+ SDValue Offset =
+ DAG.getConstant(TD->getPointerSize(),
+ Subtarget->is64Bit() ? MVT::i64 : MVT::i32);
+ return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
+ DAG.getNode(ISD::ADD, dl, getPointerTy(),
+ FrameAddr, Offset),
+ NULL, 0, false, false, 0);
+ }
+
+ // Just load the return address.
+ SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
+ return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
+ RetAddrFI, NULL, 0, false, false, 0);
+}
+
+SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
+ MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+ MFI->setFrameAddressIsTaken(true);
+
+ EVT VT = Op.getValueType();
+ DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful
+ unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP;
+ SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
+ while (Depth--)
+ FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, NULL, 0,
+ false, false, 0);
+ return FrameAddr;
+}
+
+SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
+ SelectionDAG &DAG) const {
+ return DAG.getIntPtrConstant(2*TD->getPointerSize());
+}
+
+SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ SDValue Chain = Op.getOperand(0);
+ SDValue Offset = Op.getOperand(1);
+ SDValue Handler = Op.getOperand(2);
+ DebugLoc dl = Op.getDebugLoc();
+
+ SDValue Frame = DAG.getRegister(Subtarget->is64Bit() ? X86::RBP : X86::EBP,
+ getPointerTy());
+ unsigned StoreAddrReg = (Subtarget->is64Bit() ? X86::RCX : X86::ECX);
+
+ SDValue StoreAddr = DAG.getNode(ISD::SUB, dl, getPointerTy(), Frame,
+ DAG.getIntPtrConstant(-TD->getPointerSize()));
+ StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset);
+ Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, NULL, 0, false, false, 0);
+ Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
+ MF.getRegInfo().addLiveOut(StoreAddrReg);
+
+ return DAG.getNode(X86ISD::EH_RETURN, dl,
+ MVT::Other,
+ Chain, DAG.getRegister(StoreAddrReg, getPointerTy()));
+}
+
+SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue Root = Op.getOperand(0);
+ SDValue Trmp = Op.getOperand(1); // trampoline
+ SDValue FPtr = Op.getOperand(2); // nested function
+ SDValue Nest = Op.getOperand(3); // 'nest' parameter value
+ DebugLoc dl = Op.getDebugLoc();
+
+ const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
+
+ if (Subtarget->is64Bit()) {
+ SDValue OutChains[6];
+
+ // Large code-model.
+ const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
+ const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
+
+ const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10);
+ const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11);
+
+ const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
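+
+ // The stores below assemble a 23-byte trampoline of the form (byte offsets):
+ // 0: 49 BB <FPtr:8> movabsq $FPtr, %r11
+ // 10: 49 BA <Nest:8> movabsq $Nest, %r10
+ // 20: 49 FF E3 jmpq *%r11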
+
+ // Load the pointer to the nested function into R11.
+ unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
+ SDValue Addr = Trmp;
+ OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
+ Addr, TrmpAddr, 0, false, false, 0);
+
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+ DAG.getConstant(2, MVT::i64));
+ OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, TrmpAddr, 2,
+ false, false, 2);
+
+ // Load the 'nest' parameter value into R10.
+ // R10 is specified in X86CallingConv.td
+ OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+ DAG.getConstant(10, MVT::i64));
+ OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
+ Addr, TrmpAddr, 10, false, false, 0);
+
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+ DAG.getConstant(12, MVT::i64));
+ OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 12,
+ false, false, 2);
+
+ // Jump to the nested function.
+ OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+ DAG.getConstant(20, MVT::i64));
+ OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
+ Addr, TrmpAddr, 20, false, false, 0);
+
+ unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+ DAG.getConstant(22, MVT::i64));
+ OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr,
+ TrmpAddr, 22, false, false, 0);
+
+ SDValue Ops[] =
+ { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) };
+ return DAG.getMergeValues(Ops, 2, dl);
+ } else {
+ const Function *Func =
+ cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
+ CallingConv::ID CC = Func->getCallingConv();
+ unsigned NestReg;
+
+ switch (CC) {
+ default:
+ llvm_unreachable("Unsupported calling convention");
+ case CallingConv::C:
+ case CallingConv::X86_StdCall: {
+ // Pass 'nest' parameter in ECX.
+ // Must be kept in sync with X86CallingConv.td
+ NestReg = X86::ECX;
+
+ // Check that ECX wasn't needed by an 'inreg' parameter.
+ const FunctionType *FTy = Func->getFunctionType();
+ const AttrListPtr &Attrs = Func->getAttributes();
+
+ if (!Attrs.isEmpty() && !Func->isVarArg()) {
+ unsigned InRegCount = 0;
+ unsigned Idx = 1;
+
+ for (FunctionType::param_iterator I = FTy->param_begin(),
+ E = FTy->param_end(); I != E; ++I, ++Idx)
+ if (Attrs.paramHasAttr(Idx, Attribute::InReg))
+ // FIXME: should only count parameters that are lowered to integers.
+ InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
+
+ if (InRegCount > 2) {
+ report_fatal_error("Nest register in use - reduce number of inreg parameters!");
+ }
+ }
+ break;
+ }
+ case CallingConv::X86_FastCall:
+ case CallingConv::X86_ThisCall:
+ case CallingConv::Fast:
+ // Pass 'nest' parameter in EAX.
+ // Must be kept in sync with X86CallingConv.td
+ NestReg = X86::EAX;
+ break;
+ }
+
+ SDValue OutChains[4];
+ SDValue Addr, Disp;
+
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+ DAG.getConstant(10, MVT::i32));
+ Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
+
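+ // The stores below assemble a 10-byte trampoline of the form (byte offsets):
+ // 0: B8+r <Nest:4> movl $Nest, %ecx or %eax
+ // 5: E9 <rel32:4> jmp FPtr, with rel32 = FPtr - (Trmp + 10)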
+ // This is storing the opcode for MOV32ri.
+ const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
+ const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg);
+ OutChains[0] = DAG.getStore(Root, dl,
+ DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
+ Trmp, TrmpAddr, 0, false, false, 0);
+
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+ DAG.getConstant(1, MVT::i32));
+ OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 1,
+ false, false, 1);
+
+ const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+ DAG.getConstant(5, MVT::i32));
+ OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
+ TrmpAddr, 5, false, false, 1);
+
+ Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+ DAG.getConstant(6, MVT::i32));
+ OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, TrmpAddr, 6,
+ false, false, 1);
+
+ SDValue Ops[] =
+ { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) };
+ return DAG.getMergeValues(Ops, 2, dl);
+ }
+}
+
+SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
+ SelectionDAG &DAG) const {
+ /*
+ The rounding mode is in bits 11:10 of FPSR, and has the following
+ settings:
+ 00 Round to nearest
+ 01 Round to -inf
+ 10 Round to +inf
+ 11 Round to 0
+
+ FLT_ROUNDS, on the other hand, expects the following:
+ -1 Undefined
+ 0 Round to 0
+ 1 Round to nearest
+ 2 Round to +inf
+ 3 Round to -inf
+
+ To perform the conversion, we do:
+ (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
+ */
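+ /*
+ Checking the conversion against each mode:
+ RC=00 (nearest): ((0 | 0) + 1) & 3 = 1
+ RC=01 (-inf): ((0 | 2) + 1) & 3 = 3
+ RC=10 (+inf): ((1 | 0) + 1) & 3 = 2
+ RC=11 (to zero): ((1 | 2) + 1) & 3 = 0
+ */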
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ const TargetMachine &TM = MF.getTarget();
+ const TargetFrameInfo &TFI = *TM.getFrameInfo();
+ unsigned StackAlignment = TFI.getStackAlignment();
+ EVT VT = Op.getValueType();
+ DebugLoc dl = Op.getDebugLoc();
+
+ // Save FP Control Word to stack slot
+ int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
+ SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+
+ SDValue Chain = DAG.getNode(X86ISD::FNSTCW16m, dl, MVT::Other,
+ DAG.getEntryNode(), StackSlot);
+
+ // Load FP Control Word from stack slot
+ SDValue CWD = DAG.getLoad(MVT::i16, dl, Chain, StackSlot, NULL, 0,
+ false, false, 0);
+
+ // Transform as necessary
+ SDValue CWD1 =
+ DAG.getNode(ISD::SRL, dl, MVT::i16,
+ DAG.getNode(ISD::AND, dl, MVT::i16,
+ CWD, DAG.getConstant(0x800, MVT::i16)),
+ DAG.getConstant(11, MVT::i8));
+ SDValue CWD2 =
+ DAG.getNode(ISD::SRL, dl, MVT::i16,
+ DAG.getNode(ISD::AND, dl, MVT::i16,
+ CWD, DAG.getConstant(0x400, MVT::i16)),
+ DAG.getConstant(9, MVT::i8));
+
+ SDValue RetVal =
+ DAG.getNode(ISD::AND, dl, MVT::i16,
+ DAG.getNode(ISD::ADD, dl, MVT::i16,
+ DAG.getNode(ISD::OR, dl, MVT::i16, CWD1, CWD2),
+ DAG.getConstant(1, MVT::i16)),
+ DAG.getConstant(3, MVT::i16));
+
+
+ return DAG.getNode((VT.getSizeInBits() < 16 ?
+ ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal);
+}
+
+SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ EVT OpVT = VT;
+ unsigned NumBits = VT.getSizeInBits();
+ DebugLoc dl = Op.getDebugLoc();
+
+ Op = Op.getOperand(0);
+ if (VT == MVT::i8) {
+ // Zero extend to i32 since there is no i8 bsr instruction.
+ OpVT = MVT::i32;
+ Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
+ }
+
+ // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
+ SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
+ Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
+
+ // If src is zero (i.e. bsr sets ZF), returns NumBits.
+ SDValue Ops[] = {
+ Op,
+ DAG.getConstant(NumBits+NumBits-1, OpVT),
+ DAG.getConstant(X86::COND_E, MVT::i8),
+ Op.getValue(1)
+ };
+ Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops));
+
+ // Finally xor with NumBits-1.
+ Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
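+ // For example, a 32-bit input with only bit 28 set gives bsr = 28, and
+ // 28 ^ 31 = 3 leading zeros. For a zero input the cmov above substitutes
+ // 2*NumBits-1 (63), and 63 ^ 31 = 32 = NumBits.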
+
+ if (VT == MVT::i8)
+ Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
+ return Op;
+}
+
+SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ EVT OpVT = VT;
+ unsigned NumBits = VT.getSizeInBits();
+ DebugLoc dl = Op.getDebugLoc();
+
+ Op = Op.getOperand(0);
+ if (VT == MVT::i8) {
+ OpVT = MVT::i32;
+ Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
+ }
+
+ // Issue a bsf (scan bits forward) which also sets EFLAGS.
+ SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
+ Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);
+
+ // If src is zero (i.e. bsf sets ZF), returns NumBits.
+ SDValue Ops[] = {
+ Op,
+ DAG.getConstant(NumBits, OpVT),
+ DAG.getConstant(X86::COND_E, MVT::i8),
+ Op.getValue(1)
+ };
+ Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops));
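+ // Unlike CTLZ, no post-processing is needed here: bsf already returns the
+ // trailing-zero count, and the cmov above substitutes NumBits for a zero
+ // input.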
+
+ if (VT == MVT::i8)
+ Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
+ return Op;
+}
+
+SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply");
+ DebugLoc dl = Op.getDebugLoc();
+
+ // ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32);
+ // ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32);
+ // ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b );
+ // ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi );
+ // ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b );
+ //
+ // AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 );
+ // AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 );
+ // return AloBlo + AloBhi + AhiBlo;
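+ //
+ // Writing each 64-bit lane as a = 2^32*Ahi + Alo and b = 2^32*Bhi + Blo,
+ // a*b mod 2^64 = Alo*Blo + 2^32*(Alo*Bhi + Ahi*Blo); the Ahi*Bhi term falls
+ // entirely outside the low 64 bits. pmuludq supplies the three 32x32->64
+ // partial products.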
+
+ SDValue A = Op.getOperand(0);
+ SDValue B = Op.getOperand(1);
+
+ SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
+ A, DAG.getConstant(32, MVT::i32));
+ SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
+ B, DAG.getConstant(32, MVT::i32));
+ SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
+ A, B);
+ SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
+ A, Bhi);
+ SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
+ Ahi, B);
+ AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
+ AloBhi, DAG.getConstant(32, MVT::i32));
+ AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
+ AhiBlo, DAG.getConstant(32, MVT::i32));
+ SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
+ Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
+ return Res;
+}
+
+
+SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
+ // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
+ // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
+ // looks for this combo and may remove the "setcc" instruction if the "setcc"
+ // has only one use.
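+ // For example, {i32, i1} = ISD::UADDO(a, b) becomes an X86ISD::ADD that
+ // also produces EFLAGS, followed by a SETCC on COND_B (carry set).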
+ SDNode *N = Op.getNode();
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ unsigned BaseOp = 0;
+ unsigned Cond = 0;
+ DebugLoc dl = Op.getDebugLoc();
+
+ switch (Op.getOpcode()) {
+ default: llvm_unreachable("Unknown ovf instruction!");
+ case ISD::SADDO:
+ // An add of one will be selected as an INC. Note that INC doesn't
+ // set CF, so we can't do this for UADDO.
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op))
+ if (C->getAPIntValue() == 1) {
+ BaseOp = X86ISD::INC;
+ Cond = X86::COND_O;
+ break;
+ }
+ BaseOp = X86ISD::ADD;
+ Cond = X86::COND_O;
+ break;
+ case ISD::UADDO:
+ BaseOp = X86ISD::ADD;
+ Cond = X86::COND_B;
+ break;
+ case ISD::SSUBO:
+ // A subtract of one will be selected as a DEC. Note that DEC doesn't
+ // set CF, so we can't do this for USUBO.
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op))
+ if (C->getAPIntValue() == 1) {
+ BaseOp = X86ISD::DEC;
+ Cond = X86::COND_O;
+ break;
+ }
+ BaseOp = X86ISD::SUB;
+ Cond = X86::COND_O;
+ break;
+ case ISD::USUBO:
+ BaseOp = X86ISD::SUB;
+ Cond = X86::COND_B;
+ break;
+ case ISD::SMULO:
+ BaseOp = X86ISD::SMUL;
+ Cond = X86::COND_O;
+ break;
+ case ISD::UMULO:
+ BaseOp = X86ISD::UMUL;
+ Cond = X86::COND_B;
+ break;
+ }
+
+ // Also sets EFLAGS.
+ SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
+ SDValue Sum = DAG.getNode(BaseOp, dl, VTs, LHS, RHS);
+
+ SDValue SetCC =
+ DAG.getNode(X86ISD::SETCC, dl, N->getValueType(1),
+ DAG.getConstant(Cond, MVT::i32), SDValue(Sum.getNode(), 1));
+
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC);
+ return Sum;
+}
+
+SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
+ EVT T = Op.getValueType();
+ DebugLoc dl = Op.getDebugLoc();
+ unsigned Reg = 0;
+ unsigned size = 0;
+ switch(T.getSimpleVT().SimpleTy) {
+ default:
+ assert(false && "Invalid value type!");
+ case MVT::i8: Reg = X86::AL; size = 1; break;
+ case MVT::i16: Reg = X86::AX; size = 2; break;
+ case MVT::i32: Reg = X86::EAX; size = 4; break;
+ case MVT::i64:
+ assert(Subtarget->is64Bit() && "Node not type legal!");
+ Reg = X86::RAX; size = 8;
+ break;
+ }
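+ // cmpxchg takes its comparand in, and returns the old value through, the
+ // accumulator (AL/AX/EAX/RAX), hence the explicit register copies below.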
+ SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), dl, Reg,
+ Op.getOperand(2), SDValue());
+ SDValue Ops[] = { cpIn.getValue(0),
+ Op.getOperand(1),
+ Op.getOperand(3),
+ DAG.getTargetConstant(size, MVT::i8),
+ cpIn.getValue(1) };
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+ SDValue Result = DAG.getNode(X86ISD::LCMPXCHG_DAG, dl, Tys, Ops, 5);
+ SDValue cpOut =
+ DAG.getCopyFromReg(Result.getValue(0), dl, Reg, T, Result.getValue(1));
+ return cpOut;
+}
+
+SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Subtarget->is64Bit() && "Result not type legalized?");
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+ SDValue TheChain = Op.getOperand(0);
+ DebugLoc dl = Op.getDebugLoc();
+ SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
+ SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1));
+ SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64,
+ rax.getValue(2));
+ SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx,
+ DAG.getConstant(32, MVT::i8));
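+ // rdtsc returns the timestamp split across EDX:EAX (the upper halves of RDX
+ // and RAX are zeroed in 64-bit mode), so reassemble it as (rdx << 32) | rax.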
+ SDValue Ops[] = {
+ DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp),
+ rdx.getValue(1)
+ };
+ return DAG.getMergeValues(Ops, 2, dl);
+}
+
+SDValue X86TargetLowering::LowerBIT_CONVERT(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT SrcVT = Op.getOperand(0).getValueType();
+ EVT DstVT = Op.getValueType();
+ assert((Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
+ Subtarget->hasMMX() && !DisableMMX) &&
+ "Unexpected custom BIT_CONVERT");
+ assert((DstVT == MVT::i64 ||
+ (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
+ "Unexpected custom BIT_CONVERT");
+ // i64 <=> MMX conversions are Legal.
+ if (SrcVT==MVT::i64 && DstVT.isVector())
+ return Op;
+ if (DstVT==MVT::i64 && SrcVT.isVector())
+ return Op;
+ // MMX <=> MMX conversions are Legal.
+ if (SrcVT.isVector() && DstVT.isVector())
+ return Op;
+ // All other conversions need to be expanded.
+ return SDValue();
+}
+SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const {
+ SDNode *Node = Op.getNode();
+ DebugLoc dl = Node->getDebugLoc();
+ EVT T = Node->getValueType(0);
+ SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
+ DAG.getConstant(0, T), Node->getOperand(2));
+ return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
+ cast<AtomicSDNode>(Node)->getMemoryVT(),
+ Node->getOperand(0),
+ Node->getOperand(1), negOp,
+ cast<AtomicSDNode>(Node)->getSrcValue(),
+ cast<AtomicSDNode>(Node)->getAlignment());
+}
+
+/// LowerOperation - Provide custom lowering hooks for some operations.
+///
+SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+ switch (Op.getOpcode()) {
+ default: llvm_unreachable("Should not custom lower this!");
+ case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG);
+ case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG);
+ case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
+ case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
+ case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
+ case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
+ case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
+ case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG);
+ case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
+ case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
+ case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
+ case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
+ case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
+ case ISD::SHL_PARTS:
+ case ISD::SRA_PARTS:
+ case ISD::SRL_PARTS: return LowerShift(Op, DAG);
+ case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
+ case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
+ case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
+ case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
+ case ISD::FABS: return LowerFABS(Op, DAG);
+ case ISD::FNEG: return LowerFNEG(Op, DAG);
+ case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
+ case ISD::SETCC: return LowerSETCC(Op, DAG);
+ case ISD::VSETCC: return LowerVSETCC(Op, DAG);
+ case ISD::SELECT: return LowerSELECT(Op, DAG);
+ case ISD::BRCOND: return LowerBRCOND(Op, DAG);
+ case ISD::JumpTable: return LowerJumpTable(Op, DAG);
+ case ISD::VASTART: return LowerVASTART(Op, DAG);
+ case ISD::VAARG: return LowerVAARG(Op, DAG);
+ case ISD::VACOPY: return LowerVACOPY(Op, DAG);
+ case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+ case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
+ case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
+ case ISD::FRAME_TO_ARGS_OFFSET:
+ return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
+ case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
+ case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
+ case ISD::TRAMPOLINE: return LowerTRAMPOLINE(Op, DAG);
+ case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
+ case ISD::CTLZ: return LowerCTLZ(Op, DAG);
+ case ISD::CTTZ: return LowerCTTZ(Op, DAG);
+ case ISD::MUL: return LowerMUL_V2I64(Op, DAG);
+ case ISD::SADDO:
+ case ISD::UADDO:
+ case ISD::SSUBO:
+ case ISD::USUBO:
+ case ISD::SMULO:
+ case ISD::UMULO: return LowerXALUO(Op, DAG);
+ case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG);
+ case ISD::BIT_CONVERT: return LowerBIT_CONVERT(Op, DAG);
+ }
+}
+
+void X86TargetLowering::
+ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
+ SelectionDAG &DAG, unsigned NewOp) const {
+ EVT T = Node->getValueType(0);
+ DebugLoc dl = Node->getDebugLoc();
+ assert (T == MVT::i64 && "Only know how to expand i64 atomics");
+
+ SDValue Chain = Node->getOperand(0);
+ SDValue In1 = Node->getOperand(1);
+ SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+ Node->getOperand(2), DAG.getIntPtrConstant(0));
+ SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+ Node->getOperand(2), DAG.getIntPtrConstant(1));
+ SDValue Ops[] = { Chain, In1, In2L, In2H };
+ SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
+ SDValue Result =
+ DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64,
+ cast<MemSDNode>(Node)->getMemOperand());
+ SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)};
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
+ Results.push_back(Result.getValue(2));
+}
+
+/// ReplaceNodeResults - Replace a node with an illegal result type
+/// with a new node built out of custom code.
+void X86TargetLowering::ReplaceNodeResults(SDNode *N,
+ SmallVectorImpl<SDValue>&Results,
+ SelectionDAG &DAG) const {
+ DebugLoc dl = N->getDebugLoc();
+ switch (N->getOpcode()) {
+ default:
+ assert(false && "Do not know how to custom type legalize this operation!");
+ return;
+ case ISD::FP_TO_SINT: {
+ std::pair<SDValue,SDValue> Vals =
+ FP_TO_INTHelper(SDValue(N, 0), DAG, true);
+ SDValue FIST = Vals.first, StackSlot = Vals.second;
+ if (FIST.getNode() != 0) {
+ EVT VT = N->getValueType(0);
+ // Return a load from the stack slot.
+ Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, NULL, 0,
+ false, false, 0));
+ }
+ return;
+ }
+ case ISD::READCYCLECOUNTER: {
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+ SDValue TheChain = N->getOperand(0);
+ SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
+ SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32,
+ rd.getValue(1));
+ SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32,
+ eax.getValue(2));
+ // Use a buildpair to merge the two 32-bit values into a 64-bit one.
+ SDValue Ops[] = { eax, edx };
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2));
+ Results.push_back(edx.getValue(1));
+ return;
+ }
+ case ISD::ATOMIC_CMP_SWAP: {
+ EVT T = N->getValueType(0);
+ assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap");
+ SDValue cpInL, cpInH;
+ cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
+ DAG.getConstant(0, MVT::i32));
+ cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
+ DAG.getConstant(1, MVT::i32));
+ cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue());
+ cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH,
+ cpInL.getValue(1));
+ SDValue swapInL, swapInH;
+ swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
+ DAG.getConstant(0, MVT::i32));
+ swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
+ DAG.getConstant(1, MVT::i32));
+ swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL,
+ cpInH.getValue(1));
+ swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH,
+ swapInL.getValue(1));
+ SDValue Ops[] = { swapInH.getValue(0),
+ N->getOperand(1),
+ swapInH.getValue(1) };
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+ SDValue Result = DAG.getNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, 3);
+ SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX,
+ MVT::i32, Result.getValue(1));
+ SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX,
+ MVT::i32, cpOutL.getValue(2));
+ SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
+ Results.push_back(cpOutH.getValue(1));
+ return;
+ }
+ case ISD::ATOMIC_LOAD_ADD:
+ ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG);
+ return;
+ case ISD::ATOMIC_LOAD_AND:
+ ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG);
+ return;
+ case ISD::ATOMIC_LOAD_NAND:
+ ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG);
+ return;
+ case ISD::ATOMIC_LOAD_OR:
+ ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG);
+ return;
+ case ISD::ATOMIC_LOAD_SUB:
+ ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG);
+ return;
+ case ISD::ATOMIC_LOAD_XOR:
+ ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG);
+ return;
+ case ISD::ATOMIC_SWAP:
+ ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG);
+ return;
+ }
+}
+
+const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
+ switch (Opcode) {
+ default: return NULL;
+ case X86ISD::BSF: return "X86ISD::BSF";
+ case X86ISD::BSR: return "X86ISD::BSR";
+ case X86ISD::SHLD: return "X86ISD::SHLD";
+ case X86ISD::SHRD: return "X86ISD::SHRD";
+ case X86ISD::FAND: return "X86ISD::FAND";
+ case X86ISD::FOR: return "X86ISD::FOR";
+ case X86ISD::FXOR: return "X86ISD::FXOR";
+ case X86ISD::FSRL: return "X86ISD::FSRL";
+ case X86ISD::FILD: return "X86ISD::FILD";
+ case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
+ case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
+ case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
+ case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
+ case X86ISD::FLD: return "X86ISD::FLD";
+ case X86ISD::FST: return "X86ISD::FST";
+ case X86ISD::CALL: return "X86ISD::CALL";
+ case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
+ case X86ISD::BT: return "X86ISD::BT";
+ case X86ISD::CMP: return "X86ISD::CMP";
+ case X86ISD::COMI: return "X86ISD::COMI";
+ case X86ISD::UCOMI: return "X86ISD::UCOMI";
+ case X86ISD::SETCC: return "X86ISD::SETCC";
+ case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
+ case X86ISD::CMOV: return "X86ISD::CMOV";
+ case X86ISD::BRCOND: return "X86ISD::BRCOND";
+ case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
+ case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
+ case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
+ case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
+ case X86ISD::Wrapper: return "X86ISD::Wrapper";
+ case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
+ case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
+ case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
+ case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
+ case X86ISD::PINSRB: return "X86ISD::PINSRB";
+ case X86ISD::PINSRW: return "X86ISD::PINSRW";
+ case X86ISD::MMX_PINSRW: return "X86ISD::MMX_PINSRW";
+ case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
+ case X86ISD::FMAX: return "X86ISD::FMAX";
+ case X86ISD::FMIN: return "X86ISD::FMIN";
+ case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
+ case X86ISD::FRCP: return "X86ISD::FRCP";
+ case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
+ case X86ISD::SegmentBaseAddress: return "X86ISD::SegmentBaseAddress";
+ case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
+ case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
+ case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
+ case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
+ case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
+ case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG";
+ case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG";
+ case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG";
+ case X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG";
+ case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG";
+ case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG";
+ case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
+ case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
+ case X86ISD::VSHL: return "X86ISD::VSHL";
+ case X86ISD::VSRL: return "X86ISD::VSRL";
+ case X86ISD::CMPPD: return "X86ISD::CMPPD";
+ case X86ISD::CMPPS: return "X86ISD::CMPPS";
+ case X86ISD::PCMPEQB: return "X86ISD::PCMPEQB";
+ case X86ISD::PCMPEQW: return "X86ISD::PCMPEQW";
+ case X86ISD::PCMPEQD: return "X86ISD::PCMPEQD";
+ case X86ISD::PCMPEQQ: return "X86ISD::PCMPEQQ";
+ case X86ISD::PCMPGTB: return "X86ISD::PCMPGTB";
+ case X86ISD::PCMPGTW: return "X86ISD::PCMPGTW";
+ case X86ISD::PCMPGTD: return "X86ISD::PCMPGTD";
+ case X86ISD::PCMPGTQ: return "X86ISD::PCMPGTQ";
+ case X86ISD::ADD: return "X86ISD::ADD";
+ case X86ISD::SUB: return "X86ISD::SUB";
+ case X86ISD::SMUL: return "X86ISD::SMUL";
+ case X86ISD::UMUL: return "X86ISD::UMUL";
+ case X86ISD::INC: return "X86ISD::INC";
+ case X86ISD::DEC: return "X86ISD::DEC";
+ case X86ISD::OR: return "X86ISD::OR";
+ case X86ISD::XOR: return "X86ISD::XOR";
+ case X86ISD::AND: return "X86ISD::AND";
+ case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
+ case X86ISD::PTEST: return "X86ISD::PTEST";
+ case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
+ case X86ISD::MINGW_ALLOCA: return "X86ISD::MINGW_ALLOCA";
+ }
+}
+
+// isLegalAddressingMode - Return true if the addressing mode represented
+// by AM is legal for this target, for a load/store of the specified type.
+bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
+ const Type *Ty) const {
+ // X86 supports extremely general addressing modes.
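+ // A legal mode is roughly [base + index*scale + disp32] with an optional
+ // global, e.g. "movl 16(%rbx,%rcx,4), %eax"; hardware scales are 1, 2, 4
+ // and 8 (3, 5 and 9 are handled below by reusing the index as the base).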
+ CodeModel::Model M = getTargetMachine().getCodeModel();
+
+ // X86 allows a sign-extended 32-bit immediate field as a displacement.
+ if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL))
+ return false;
+
+ if (AM.BaseGV) {
+ unsigned GVFlags =
+ Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());
+
+ // If a reference to this global requires an extra load, we can't fold it.
+ if (isGlobalStubReference(GVFlags))
+ return false;
+
+ // If BaseGV requires a register for the PIC base, we cannot also have a
+ // BaseReg specified.
+ if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
+ return false;
+
+ // If lower 4G is not available, then we must use rip-relative addressing.
+ if (Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
+ return false;
+ }
+
+ switch (AM.Scale) {
+ case 0:
+ case 1:
+ case 2:
+ case 4:
+ case 8:
+ // These scales always work.
+ break;
+ case 3:
+ case 5:
+ case 9:
+ // These scales are formed with basereg+scalereg. Only accept if there is
+ // no basereg yet.
+ if (AM.HasBaseReg)
+ return false;
+ break;
+ default: // Other stuff never works.
+ return false;
+ }
+
+ return true;
+}
+
+
+bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const {
+ if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
+ return false;
+ unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
+ unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
+ if (NumBits1 <= NumBits2)
+ return false;
+ return true;
+}
+
+bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
+ if (!VT1.isInteger() || !VT2.isInteger())
+ return false;
+ unsigned NumBits1 = VT1.getSizeInBits();
+ unsigned NumBits2 = VT2.getSizeInBits();
+ if (NumBits1 <= NumBits2)
+ return false;
+ return true;
+}
+
+bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const {
+ // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
+ return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
+}
+
+bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
+ // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
+ return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
+}
+
+bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
+ // i16 instructions are longer (0x66 prefix) and potentially slower.
+ return !(VT1 == MVT::i32 && VT2 == MVT::i16);
+}
+
+/// isShuffleMaskLegal - Targets can use this to indicate that they only
+/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
+/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
+/// are assumed to be legal.
+bool
+X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
+ EVT VT) const {
+ // Very little shuffling can be done for 64-bit vectors right now.
+ if (VT.getSizeInBits() == 64)
+ return isPALIGNRMask(M, VT, Subtarget->hasSSSE3());
+
+ // FIXME: pshufb, blends, shifts.
+ return (VT.getVectorNumElements() == 2 ||
+ ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
+ isMOVLMask(M, VT) ||
+ isSHUFPMask(M, VT) ||
+ isPSHUFDMask(M, VT) ||
+ isPSHUFHWMask(M, VT) ||
+ isPSHUFLWMask(M, VT) ||
+ isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) ||
+ isUNPCKLMask(M, VT) ||
+ isUNPCKHMask(M, VT) ||
+ isUNPCKL_v_undef_Mask(M, VT) ||
+ isUNPCKH_v_undef_Mask(M, VT));
+}
+
+bool
+X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
+ EVT VT) const {
+ unsigned NumElts = VT.getVectorNumElements();
+ // FIXME: This collection of masks seems suspect.
+ if (NumElts == 2)
+ return true;
+ if (NumElts == 4 && VT.getSizeInBits() == 128) {
+ return (isMOVLMask(Mask, VT) ||
+ isCommutedMOVLMask(Mask, VT, true) ||
+ isSHUFPMask(Mask, VT) ||
+ isCommutedSHUFPMask(Mask, VT));
+ }
+ return false;
+}
+
+//===----------------------------------------------------------------------===//
+// X86 Scheduler Hooks
+//===----------------------------------------------------------------------===//
+
+// private utility function
+MachineBasicBlock *
+X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
+ MachineBasicBlock *MBB,
+ unsigned regOpc,
+ unsigned immOpc,
+ unsigned LoadOpc,
+ unsigned CXchgOpc,
+ unsigned copyOpc,
+ unsigned notOpc,
+ unsigned EAXreg,
+ TargetRegisterClass *RC,
+ bool invSrc) const {
+ // For the atomic bitwise operator, we generate
+ // thisMBB:
+ // newMBB:
+ // ld t1 = [bitinstr.addr]
+ // op t2 = t1, [bitinstr.val]
+ // mov EAX = t1
+ // lcs dest = [bitinstr.addr], t2 [EAX is implicit]
+ // bz newMBB
+ // fallthrough -->nextMBB
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+ MachineFunction::iterator MBBIter = MBB;
+ ++MBBIter;
+
+ /// First build the CFG
+ MachineFunction *F = MBB->getParent();
+ MachineBasicBlock *thisMBB = MBB;
+ MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ F->insert(MBBIter, newMBB);
+ F->insert(MBBIter, nextMBB);
+
+ // Move all successors of thisMBB to nextMBB
+ nextMBB->transferSuccessors(thisMBB);
+
+ // Update thisMBB to fall through to newMBB
+ thisMBB->addSuccessor(newMBB);
+
+ // newMBB jumps to itself and falls through to nextMBB
+ newMBB->addSuccessor(nextMBB);
+ newMBB->addSuccessor(newMBB);
+
+ // Insert instructions into newMBB based on incoming instruction
+ assert(bInstr->getNumOperands() < X86AddrNumOperands + 4 &&
+ "unexpected number of operands");
+ DebugLoc dl = bInstr->getDebugLoc();
+ MachineOperand& destOper = bInstr->getOperand(0);
+ MachineOperand* argOpers[2 + X86AddrNumOperands];
+ int numArgs = bInstr->getNumOperands() - 1;
+ for (int i=0; i < numArgs; ++i)
+ argOpers[i] = &bInstr->getOperand(i+1);
+
+ // An x86 address has 5 operands: base, index, scale, displacement, and segment.
+ int lastAddrIndx = X86AddrNumOperands - 1; // [0,4]
+ int valArgIndx = lastAddrIndx + 1;
+
+ unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
+ MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1);
+ for (int i=0; i <= lastAddrIndx; ++i)
+ (*MIB).addOperand(*argOpers[i]);
+
+ unsigned tt = F->getRegInfo().createVirtualRegister(RC);
+ if (invSrc) {
+ MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1);
+ }
+ else
+ tt = t1;
+
+ unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
+ assert((argOpers[valArgIndx]->isReg() ||
+ argOpers[valArgIndx]->isImm()) &&
+ "invalid operand");
+ if (argOpers[valArgIndx]->isReg())
+ MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2);
+ else
+ MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2);
+ MIB.addReg(tt);
+ (*MIB).addOperand(*argOpers[valArgIndx]);
+
+ MIB = BuildMI(newMBB, dl, TII->get(copyOpc), EAXreg);
+ MIB.addReg(t1);
+
+ MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc));
+ for (int i=0; i <= lastAddrIndx; ++i)
+ (*MIB).addOperand(*argOpers[i]);
+ MIB.addReg(t2);
+ assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
+ (*MIB).setMemRefs(bInstr->memoperands_begin(),
+ bInstr->memoperands_end());
+
+ MIB = BuildMI(newMBB, dl, TII->get(copyOpc), destOper.getReg());
+ MIB.addReg(EAXreg);
+
+ // insert branch
+ BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
+
+ F->DeleteMachineInstr(bInstr); // The pseudo instruction is gone now.
+ return nextMBB;
+}
+
+// private utility function: 64 bit atomics on 32 bit host.
+MachineBasicBlock *
+X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
+ MachineBasicBlock *MBB,
+ unsigned regOpcL,
+ unsigned regOpcH,
+ unsigned immOpcL,
+ unsigned immOpcH,
+ bool invSrc) const {
+ // For the atomic bitwise operator, we generate
+ // thisMBB (instructions are in pairs, except cmpxchg8b)
+ // ld t1,t2 = [bitinstr.addr]
+ // newMBB:
+ // out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4)
+ // op t5, t6 <- out1, out2, [bitinstr.val]
+ // (for SWAP, substitute: mov t5, t6 <- [bitinstr.val])
+ // mov ECX, EBX <- t5, t6
+ // mov EAX, EDX <- t1, t2
+ // cmpxchg8b [bitinstr.addr] [EAX, EDX, EBX, ECX implicit]
+ // mov t3, t4 <- EAX, EDX
+ // bz newMBB
+ // result in out1, out2
+ // fallthrough -->nextMBB
+
+ const TargetRegisterClass *RC = X86::GR32RegisterClass;
+ const unsigned LoadOpc = X86::MOV32rm;
+ const unsigned copyOpc = X86::MOV32rr;
+ const unsigned NotOpc = X86::NOT32r;
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+ MachineFunction::iterator MBBIter = MBB;
+ ++MBBIter;
+
+ /// First build the CFG
+ MachineFunction *F = MBB->getParent();
+ MachineBasicBlock *thisMBB = MBB;
+ MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ F->insert(MBBIter, newMBB);
+ F->insert(MBBIter, nextMBB);
+
+ // Move all successors of thisMBB to nextMBB
+ nextMBB->transferSuccessors(thisMBB);
+
+ // Update thisMBB to fall through to newMBB
+ thisMBB->addSuccessor(newMBB);
+
+ // newMBB jumps to itself and falls through to nextMBB
+ newMBB->addSuccessor(nextMBB);
+ newMBB->addSuccessor(newMBB);
+
+ DebugLoc dl = bInstr->getDebugLoc();
+ // Insert instructions into newMBB based on incoming instruction
+ // There are 8 "real" operands plus 9 implicit def/uses, ignored here.
+ assert(bInstr->getNumOperands() < X86AddrNumOperands + 14 &&
+ "unexpected number of operands");
+ MachineOperand& dest1Oper = bInstr->getOperand(0);
+ MachineOperand& dest2Oper = bInstr->getOperand(1);
+ MachineOperand* argOpers[2 + X86AddrNumOperands];
+ for (int i=0; i < 2 + X86AddrNumOperands; ++i) {
+ argOpers[i] = &bInstr->getOperand(i+2);
+
+ // We use some of the operands multiple times, so conservatively just
+ // clear any kill flags that might be present.
+ if (argOpers[i]->isReg() && argOpers[i]->isUse())
+ argOpers[i]->setIsKill(false);
+ }
+
+ // x86 address has 5 operands: base, index, scale, displacement, and segment.
+ int lastAddrIndx = X86AddrNumOperands - 1; // [0,4]
+
+ unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
+ MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1);
+ for (int i=0; i <= lastAddrIndx; ++i)
+ (*MIB).addOperand(*argOpers[i]);
+ unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
+ MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2);
+ // add 4 to displacement.
+ for (int i=0; i <= lastAddrIndx-2; ++i)
+ (*MIB).addOperand(*argOpers[i]);
+ MachineOperand newOp3 = *(argOpers[3]);
+ if (newOp3.isImm())
+ newOp3.setImm(newOp3.getImm()+4);
+ else
+ newOp3.setOffset(newOp3.getOffset()+4);
+ (*MIB).addOperand(newOp3);
+ (*MIB).addOperand(*argOpers[lastAddrIndx]);
+
+ // t3/4 are defined later, at the bottom of the loop
+ unsigned t3 = F->getRegInfo().createVirtualRegister(RC);
+ unsigned t4 = F->getRegInfo().createVirtualRegister(RC);
+ BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg())
+ .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB);
+ BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg())
+ .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB);
+
+ // The subsequent operations should use the destination registers of
+ // the PHI instructions.
+ if (invSrc) {
+ t1 = F->getRegInfo().createVirtualRegister(RC);
+ t2 = F->getRegInfo().createVirtualRegister(RC);
+ MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t1).addReg(dest1Oper.getReg());
+ MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t2).addReg(dest2Oper.getReg());
+ } else {
+ t1 = dest1Oper.getReg();
+ t2 = dest2Oper.getReg();
+ }
+
+ int valArgIndx = lastAddrIndx + 1;
+ assert((argOpers[valArgIndx]->isReg() ||
+ argOpers[valArgIndx]->isImm()) &&
+ "invalid operand");
+ unsigned t5 = F->getRegInfo().createVirtualRegister(RC);
+ unsigned t6 = F->getRegInfo().createVirtualRegister(RC);
+ if (argOpers[valArgIndx]->isReg())
+ MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5);
+ else
+ MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5);
+ if (regOpcL != X86::MOV32rr)
+ MIB.addReg(t1);
+ (*MIB).addOperand(*argOpers[valArgIndx]);
+ assert(argOpers[valArgIndx + 1]->isReg() ==
+ argOpers[valArgIndx]->isReg());
+ assert(argOpers[valArgIndx + 1]->isImm() ==
+ argOpers[valArgIndx]->isImm());
+ if (argOpers[valArgIndx + 1]->isReg())
+ MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6);
+ else
+ MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6);
+ if (regOpcH != X86::MOV32rr)
+ MIB.addReg(t2);
+ (*MIB).addOperand(*argOpers[valArgIndx + 1]);
+
+ MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EAX);
+ MIB.addReg(t1);
+ MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EDX);
+ MIB.addReg(t2);
+
+ MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EBX);
+ MIB.addReg(t5);
+ MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::ECX);
+ MIB.addReg(t6);
+
+ MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B));
+ for (int i=0; i <= lastAddrIndx; ++i)
+ (*MIB).addOperand(*argOpers[i]);
+
+ assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
+ (*MIB).setMemRefs(bInstr->memoperands_begin(),
+ bInstr->memoperands_end());
+
+ MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t3);
+ MIB.addReg(X86::EAX);
+ MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t4);
+ MIB.addReg(X86::EDX);
+
+ // insert branch
+ BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
+
+ F->DeleteMachineInstr(bInstr); // The pseudo instruction is gone now.
+ return nextMBB;
+}
+
+// private utility function
+MachineBasicBlock *
+X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
+ MachineBasicBlock *MBB,
+ unsigned cmovOpc) const {
+ // For the atomic min/max operator, we generate
+ // thisMBB:
+ // newMBB:
+ // ld t1 = [min/max.addr]
+ // mov t2 = [min/max.val]
+ // cmp t1, t2
+ // cmov[cond] t2 = t1
+ // mov EAX = t1
+ // lcs dest = [bitinstr.addr], t2 [EAX is implicit]
+ // bz newMBB
+ // fallthrough -->nextMBB
+ //
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+ MachineFunction::iterator MBBIter = MBB;
+ ++MBBIter;
+
+ /// First build the CFG
+ MachineFunction *F = MBB->getParent();
+ MachineBasicBlock *thisMBB = MBB;
+ MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ F->insert(MBBIter, newMBB);
+ F->insert(MBBIter, nextMBB);
+
+ // Move all successors of thisMBB to nextMBB
+ nextMBB->transferSuccessors(thisMBB);
+
+ // Update thisMBB to fall through to newMBB
+ thisMBB->addSuccessor(newMBB);
+
+ // newMBB jumps to itself and falls through to nextMBB
+ newMBB->addSuccessor(nextMBB);
+ newMBB->addSuccessor(newMBB);
+
+ DebugLoc dl = mInstr->getDebugLoc();
+ // Insert instructions into newMBB based on incoming instruction
+ assert(mInstr->getNumOperands() < X86AddrNumOperands + 4 &&
+ "unexpected number of operands");
+ MachineOperand& destOper = mInstr->getOperand(0);
+ MachineOperand* argOpers[2 + X86AddrNumOperands];
+ int numArgs = mInstr->getNumOperands() - 1;
+ for (int i=0; i < numArgs; ++i)
+ argOpers[i] = &mInstr->getOperand(i+1);
+
+ // An x86 address has 5 operands: base, index, scale, displacement, and segment.
+ int lastAddrIndx = X86AddrNumOperands - 1; // [0,4]
+ int valArgIndx = lastAddrIndx + 1;
+
+ unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
+ MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1);
+ for (int i=0; i <= lastAddrIndx; ++i)
+ (*MIB).addOperand(*argOpers[i]);
+
+ // We only support register and immediate values
+ assert((argOpers[valArgIndx]->isReg() ||
+ argOpers[valArgIndx]->isImm()) &&
+ "invalid operand");
+
+ unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
+ if (argOpers[valArgIndx]->isReg())
+ MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2);
+ else
+ // Use the immediate form of the move when the operand is an immediate.
+ MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32ri), t2);
+ (*MIB).addOperand(*argOpers[valArgIndx]);
+
+ MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), X86::EAX);
+ MIB.addReg(t1);
+
+ MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr));
+ MIB.addReg(t1);
+ MIB.addReg(t2);
+
+ // Generate the cmov
+ unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
+ MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3);
+ MIB.addReg(t2);
+ MIB.addReg(t1);
+
+ // Compare and exchange if nothing has modified the memory location
+ MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32));
+ for (int i=0; i <= lastAddrIndx; ++i)
+ (*MIB).addOperand(*argOpers[i]);
+ MIB.addReg(t3);
+ assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand");
+ (*MIB).setMemRefs(mInstr->memoperands_begin(),
+ mInstr->memoperands_end());
+
+ MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), destOper.getReg());
+ MIB.addReg(X86::EAX);
+
+ // insert branch
+ BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
+
+ F->DeleteMachineInstr(mInstr); // The pseudo instruction is gone now.
+ return nextMBB;
+}
+
+// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
+// all of this code can be replaced with that in the .td file.
+MachineBasicBlock *
+X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB,
+ unsigned numArgs, bool memArg) const {
+
+ MachineFunction *F = BB->getParent();
+ DebugLoc dl = MI->getDebugLoc();
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+
+ unsigned Opc;
+ if (memArg)
+ Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm;
+ else
+ Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr;
+
+ MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(Opc));
+
+ for (unsigned i = 0; i < numArgs; ++i) {
+ MachineOperand &Op = MI->getOperand(i+1);
+
+ if (!(Op.isReg() && Op.isImplicit()))
+ MIB.addOperand(Op);
+ }
+
+ BuildMI(BB, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg())
+ .addReg(X86::XMM0);
+
+ F->DeleteMachineInstr(MI);
+
+ return BB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
+ MachineInstr *MI,
+ MachineBasicBlock *MBB) const {
+ // Emit code to save XMM registers to the stack. The ABI says that the
+ // number of registers to save is given in %al, so it's theoretically
+ // possible to do an indirect jump trick to avoid saving all of them,
+ // however this code takes a simpler approach and just executes all
+ // of the stores if %al is non-zero. It's less code, and it's probably
+ // easier on the hardware branch predictor, and stores aren't all that
+ // expensive anyway.
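+ // On non-Win64 targets the emitted sequence is roughly:
+ //   testb %al, %al
+ //   je   <EndMBB>
+ //   movaps %xmmN, <slot N in the register save area>   (one store per arg)
+ // <EndMBB>: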
+
+ // Create the new basic blocks. One block contains all the XMM stores,
+ // and one block is the final destination regardless of whether any
+ // stores were performed.
+ const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+ MachineFunction *F = MBB->getParent();
+ MachineFunction::iterator MBBIter = MBB;
+ ++MBBIter;
+ MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ F->insert(MBBIter, XMMSaveMBB);
+ F->insert(MBBIter, EndMBB);
+
+ // Set up the CFG.
+ // Move any original successors of MBB to the end block.
+ EndMBB->transferSuccessors(MBB);
+ // The original block will now fall through to the XMM save block.
+ MBB->addSuccessor(XMMSaveMBB);
+ // The XMMSaveMBB will fall through to the end block.
+ XMMSaveMBB->addSuccessor(EndMBB);
+
+ // Now add the instructions.
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ DebugLoc DL = MI->getDebugLoc();
+
+ unsigned CountReg = MI->getOperand(0).getReg();
+ int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
+ int64_t VarArgsFPOffset = MI->getOperand(2).getImm();
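+ // Operands 3 and up are the live XMM argument registers to be spilled.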
+
+ if (!Subtarget->isTargetWin64()) {
+ // If %al is 0, branch around the XMM save block.
+ BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
+ BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB);
+ MBB->addSuccessor(EndMBB);
+ }
+
+ // In the XMM save block, save all the XMM argument registers.
+ for (int i = 3, e = MI->getNumOperands(); i != e; ++i) {
+ int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
+ MachineMemOperand *MMO =
+ F->getMachineMemOperand(
+ PseudoSourceValue::getFixedStack(RegSaveFrameIndex),
+ MachineMemOperand::MOStore, Offset,
+ /*Size=*/16, /*Align=*/16);
+ BuildMI(XMMSaveMBB, DL, TII->get(X86::MOVAPSmr))
+ .addFrameIndex(RegSaveFrameIndex)
+ .addImm(/*Scale=*/1)
+ .addReg(/*IndexReg=*/0)
+ .addImm(/*Disp=*/Offset)
+ .addReg(/*Segment=*/0)
+ .addReg(MI->getOperand(i).getReg())
+ .addMemOperand(MMO);
+ }
+
+ F->DeleteMachineInstr(MI); // The pseudo instruction is gone now.
+
+ return EndMBB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
+ MachineBasicBlock *BB) const {
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ DebugLoc DL = MI->getDebugLoc();
+
+ // To "insert" a SELECT_CC instruction, we actually have to insert the
+ // diamond control-flow pattern. The incoming instruction knows the
+ // destination vreg to set, the condition code register to branch on, the
+ // true/false values to select between, and a branch opcode to use.
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ MachineFunction::iterator It = BB;
+ ++It;
+
+ // thisMBB:
+ // ...
+ // TrueVal = ...
+ // cmpTY ccX, r1, r2
+ // bCC copy1MBB
+ // fallthrough --> copy0MBB
+ MachineBasicBlock *thisMBB = BB;
+ MachineFunction *F = BB->getParent();
+ MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ unsigned Opc =
+ X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
+ BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
+ F->insert(It, copy0MBB);
+ F->insert(It, sinkMBB);
+ // Update machine-CFG edges by first adding all successors of the current
+ // block to the new block which will contain the Phi node for the select.
+ for (MachineBasicBlock::succ_iterator I = BB->succ_begin(),
+ E = BB->succ_end(); I != E; ++I)
+ sinkMBB->addSuccessor(*I);
+ // Next, remove all successors of the current block, and add the true
+ // and fallthrough blocks as its successors.
+ while (!BB->succ_empty())
+ BB->removeSuccessor(BB->succ_begin());
+ // Add the true and fallthrough blocks as its successors.
+ BB->addSuccessor(copy0MBB);
+ BB->addSuccessor(sinkMBB);
+
+ // copy0MBB:
+ // %FalseValue = ...
+ // # fallthrough to sinkMBB
+ copy0MBB->addSuccessor(sinkMBB);
+
+ // sinkMBB:
+ // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+ // ...
+ BuildMI(sinkMBB, DL, TII->get(X86::PHI), MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
+ .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
+
+ F->DeleteMachineInstr(MI); // The pseudo instruction is gone now.
+ return sinkMBB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredMingwAlloca(MachineInstr *MI,
+ MachineBasicBlock *BB) const {
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ DebugLoc DL = MI->getDebugLoc();
+ MachineFunction *F = BB->getParent();
+
+ // The lowering is pretty easy: we're just emitting the call to _alloca. The
+ // non-trivial part is impdef of ESP.
+ // FIXME: The code should be tweaked as soon as we'll try to do codegen for
+ // mingw-w64.
+
+ BuildMI(BB, DL, TII->get(X86::CALLpcrel32))
+ .addExternalSymbol("_alloca")
+ .addReg(X86::EAX, RegState::Implicit)
+ .addReg(X86::ESP, RegState::Implicit)
+ .addReg(X86::EAX, RegState::Define | RegState::Implicit)
+ .addReg(X86::ESP, RegState::Define | RegState::Implicit);
+
+ F->DeleteMachineInstr(MI); // The pseudo instruction is gone now.
+ return BB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+ MachineBasicBlock *BB) const {
+ switch (MI->getOpcode()) {
+ default: assert(false && "Unexpected instr type to insert");
+ case X86::MINGW_ALLOCA:
+ return EmitLoweredMingwAlloca(MI, BB);
+ case X86::CMOV_GR8:
+ case X86::CMOV_V1I64:
+ case X86::CMOV_FR32:
+ case X86::CMOV_FR64:
+ case X86::CMOV_V4F32:
+ case X86::CMOV_V2F64:
+ case X86::CMOV_V2I64:
+ case X86::CMOV_GR16:
+ case X86::CMOV_GR32:
+ case X86::CMOV_RFP32:
+ case X86::CMOV_RFP64:
+ case X86::CMOV_RFP80:
+ return EmitLoweredSelect(MI, BB);
+
+ case X86::FP32_TO_INT16_IN_MEM:
+ case X86::FP32_TO_INT32_IN_MEM:
+ case X86::FP32_TO_INT64_IN_MEM:
+ case X86::FP64_TO_INT16_IN_MEM:
+ case X86::FP64_TO_INT32_IN_MEM:
+ case X86::FP64_TO_INT64_IN_MEM:
+ case X86::FP80_TO_INT16_IN_MEM:
+ case X86::FP80_TO_INT32_IN_MEM:
+ case X86::FP80_TO_INT64_IN_MEM: {
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ DebugLoc DL = MI->getDebugLoc();
+
+ // Change the floating point control register to use "round towards zero"
+ // mode when truncating to an integer value.
+ MachineFunction *F = BB->getParent();
+ int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
+ addFrameReference(BuildMI(BB, DL, TII->get(X86::FNSTCW16m)), CWFrameIdx);
+
+ // Load the old value of the control word...
+ unsigned OldCW =
+ F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass);
+ addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16rm), OldCW),
+ CWFrameIdx);
+
+ // Set the rounding mode to round toward zero...
+ addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
+ .addImm(0xC7F);
+
+ // Reload the modified control word now...
+ addFrameReference(BuildMI(BB, DL, TII->get(X86::FLDCW16m)), CWFrameIdx);
+
+ // Restore the memory image of the control word to its original value
+ addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
+ .addReg(OldCW);
+
+ // Get the X86 opcode to use.
+ unsigned Opc;
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("illegal opcode!");
+ case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
+ case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
+ case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
+ case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
+ case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
+ case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
+ case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
+ case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
+ case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
+ }
+
+ X86AddressMode AM;
+ MachineOperand &Op = MI->getOperand(0);
+ if (Op.isReg()) {
+ AM.BaseType = X86AddressMode::RegBase;
+ AM.Base.Reg = Op.getReg();
+ } else {
+ AM.BaseType = X86AddressMode::FrameIndexBase;
+ AM.Base.FrameIndex = Op.getIndex();
+ }
+ Op = MI->getOperand(1);
+ if (Op.isImm())
+ AM.Scale = Op.getImm();
+ Op = MI->getOperand(2);
+ if (Op.isImm())
+ AM.IndexReg = Op.getImm();
+ Op = MI->getOperand(3);
+ if (Op.isGlobal()) {
+ AM.GV = Op.getGlobal();
+ } else {
+ AM.Disp = Op.getImm();
+ }
+ addFullAddress(BuildMI(BB, DL, TII->get(Opc)), AM)
+ .addReg(MI->getOperand(X86AddrNumOperands).getReg());
+
+ // Reload the original control word now.
+ addFrameReference(BuildMI(BB, DL, TII->get(X86::FLDCW16m)), CWFrameIdx);
+
+ F->DeleteMachineInstr(MI); // The pseudo instruction is gone now.
+ return BB;
+ }
+ // String/text processing lowering.
+ case X86::PCMPISTRM128REG:
+ return EmitPCMP(MI, BB, 3, false /* in-mem */);
+ case X86::PCMPISTRM128MEM:
+ return EmitPCMP(MI, BB, 3, true /* in-mem */);
+ case X86::PCMPESTRM128REG:
+ return EmitPCMP(MI, BB, 5, false /* in mem */);
+ case X86::PCMPESTRM128MEM:
+ return EmitPCMP(MI, BB, 5, true /* in mem */);
+
+ // Atomic Lowering.
+ case X86::ATOMAND32:
+ return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
+ X86::AND32ri, X86::MOV32rm,
+ X86::LCMPXCHG32, X86::MOV32rr,
+ X86::NOT32r, X86::EAX,
+ X86::GR32RegisterClass);
+ case X86::ATOMOR32:
+ return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr,
+ X86::OR32ri, X86::MOV32rm,
+ X86::LCMPXCHG32, X86::MOV32rr,
+ X86::NOT32r, X86::EAX,
+ X86::GR32RegisterClass);
+ case X86::ATOMXOR32:
+ return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr,
+ X86::XOR32ri, X86::MOV32rm,
+ X86::LCMPXCHG32, X86::MOV32rr,
+ X86::NOT32r, X86::EAX,
+ X86::GR32RegisterClass);
+ case X86::ATOMNAND32:
+ return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
+ X86::AND32ri, X86::MOV32rm,
+ X86::LCMPXCHG32, X86::MOV32rr,
+ X86::NOT32r, X86::EAX,
+ X86::GR32RegisterClass, true);
+ case X86::ATOMMIN32:
+ return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr);
+ case X86::ATOMMAX32:
+ return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr);
+ case X86::ATOMUMIN32:
+ return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr);
+ case X86::ATOMUMAX32:
+ return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr);
+
+ case X86::ATOMAND16:
+ return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
+ X86::AND16ri, X86::MOV16rm,
+ X86::LCMPXCHG16, X86::MOV16rr,
+ X86::NOT16r, X86::AX,
+ X86::GR16RegisterClass);
+ case X86::ATOMOR16:
+ return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr,
+ X86::OR16ri, X86::MOV16rm,
+ X86::LCMPXCHG16, X86::MOV16rr,
+ X86::NOT16r, X86::AX,
+ X86::GR16RegisterClass);
+ case X86::ATOMXOR16:
+ return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr,
+ X86::XOR16ri, X86::MOV16rm,
+ X86::LCMPXCHG16, X86::MOV16rr,
+ X86::NOT16r, X86::AX,
+ X86::GR16RegisterClass);
+ case X86::ATOMNAND16:
+ return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
+ X86::AND16ri, X86::MOV16rm,
+ X86::LCMPXCHG16, X86::MOV16rr,
+ X86::NOT16r, X86::AX,
+ X86::GR16RegisterClass, true);
+ case X86::ATOMMIN16:
+ return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr);
+ case X86::ATOMMAX16:
+ return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr);
+ case X86::ATOMUMIN16:
+ return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr);
+ case X86::ATOMUMAX16:
+ return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr);
+
+ case X86::ATOMAND8:
+ return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
+ X86::AND8ri, X86::MOV8rm,
+ X86::LCMPXCHG8, X86::MOV8rr,
+ X86::NOT8r, X86::AL,
+ X86::GR8RegisterClass);
+ case X86::ATOMOR8:
+ return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr,
+ X86::OR8ri, X86::MOV8rm,
+ X86::LCMPXCHG8, X86::MOV8rr,
+ X86::NOT8r, X86::AL,
+ X86::GR8RegisterClass);
+ case X86::ATOMXOR8:
+ return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr,
+ X86::XOR8ri, X86::MOV8rm,
+ X86::LCMPXCHG8, X86::MOV8rr,
+ X86::NOT8r, X86::AL,
+ X86::GR8RegisterClass);
+ case X86::ATOMNAND8:
+ return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
+ X86::AND8ri, X86::MOV8rm,
+ X86::LCMPXCHG8, X86::MOV8rr,
+ X86::NOT8r, X86::AL,
+ X86::GR8RegisterClass, true);
+ // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way.
+ // This group is for 64-bit host.
+ case X86::ATOMAND64:
+ return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
+ X86::AND64ri32, X86::MOV64rm,
+ X86::LCMPXCHG64, X86::MOV64rr,
+ X86::NOT64r, X86::RAX,
+ X86::GR64RegisterClass);
+ case X86::ATOMOR64:
+ return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr,
+ X86::OR64ri32, X86::MOV64rm,
+ X86::LCMPXCHG64, X86::MOV64rr,
+ X86::NOT64r, X86::RAX,
+ X86::GR64RegisterClass);
+ case X86::ATOMXOR64:
+ return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr,
+ X86::XOR64ri32, X86::MOV64rm,
+ X86::LCMPXCHG64, X86::MOV64rr,
+ X86::NOT64r, X86::RAX,
+ X86::GR64RegisterClass);
+ case X86::ATOMNAND64:
+ return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
+ X86::AND64ri32, X86::MOV64rm,
+ X86::LCMPXCHG64, X86::MOV64rr,
+ X86::NOT64r, X86::RAX,
+ X86::GR64RegisterClass, true);
+ case X86::ATOMMIN64:
+ return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr);
+ case X86::ATOMMAX64:
+ return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr);
+ case X86::ATOMUMIN64:
+ return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr);
+ case X86::ATOMUMAX64:
+ return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr);
+
+ // This group does 64-bit operations on a 32-bit host.
+ case X86::ATOMAND6432:
+ return EmitAtomicBit6432WithCustomInserter(MI, BB,
+ X86::AND32rr, X86::AND32rr,
+ X86::AND32ri, X86::AND32ri,
+ false);
+ case X86::ATOMOR6432:
+ return EmitAtomicBit6432WithCustomInserter(MI, BB,
+ X86::OR32rr, X86::OR32rr,
+ X86::OR32ri, X86::OR32ri,
+ false);
+ case X86::ATOMXOR6432:
+ return EmitAtomicBit6432WithCustomInserter(MI, BB,
+ X86::XOR32rr, X86::XOR32rr,
+ X86::XOR32ri, X86::XOR32ri,
+ false);
+ case X86::ATOMNAND6432:
+ return EmitAtomicBit6432WithCustomInserter(MI, BB,
+ X86::AND32rr, X86::AND32rr,
+ X86::AND32ri, X86::AND32ri,
+ true);
+ case X86::ATOMADD6432:
+ return EmitAtomicBit6432WithCustomInserter(MI, BB,
+ X86::ADD32rr, X86::ADC32rr,
+ X86::ADD32ri, X86::ADC32ri,
+ false);
+ case X86::ATOMSUB6432:
+ return EmitAtomicBit6432WithCustomInserter(MI, BB,
+ X86::SUB32rr, X86::SBB32rr,
+ X86::SUB32ri, X86::SBB32ri,
+ false);
+ case X86::ATOMSWAP6432:
+ return EmitAtomicBit6432WithCustomInserter(MI, BB,
+ X86::MOV32rr, X86::MOV32rr,
+ X86::MOV32ri, X86::MOV32ri,
+ false);
+ case X86::VASTART_SAVE_XMM_REGS:
+ return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// X86 Optimization Hooks
+//===----------------------------------------------------------------------===//
+
+void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
+ const APInt &Mask,
+ APInt &KnownZero,
+ APInt &KnownOne,
+ const SelectionDAG &DAG,
+ unsigned Depth) const {
+ unsigned Opc = Op.getOpcode();
+ assert((Opc >= ISD::BUILTIN_OP_END ||
+ Opc == ISD::INTRINSIC_WO_CHAIN ||
+ Opc == ISD::INTRINSIC_W_CHAIN ||
+ Opc == ISD::INTRINSIC_VOID) &&
+ "Should use MaskedValueIsZero if you don't know whether Op"
+ " is a target node!");
+
+ KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); // Don't know anything.
+ switch (Opc) {
+ default: break;
+ case X86ISD::ADD:
+ case X86ISD::SUB:
+ case X86ISD::SMUL:
+ case X86ISD::UMUL:
+ case X86ISD::INC:
+ case X86ISD::DEC:
+ case X86ISD::OR:
+ case X86ISD::XOR:
+ case X86ISD::AND:
+ // These nodes' second result is a boolean.
+ if (Op.getResNo() == 0)
+ break;
+ // Fallthrough
+ case X86ISD::SETCC:
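+ // The boolean result is either 0 or 1, so every bit above bit 0 is zero.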
+ KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(),
+ Mask.getBitWidth() - 1);
+ break;
+ }
+}
+
+/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
+/// node is a GlobalAddress + offset.
+bool X86TargetLowering::isGAPlusOffset(SDNode *N,
+ const GlobalValue* &GA,
+ int64_t &Offset) const {
+ if (N->getOpcode() == X86ISD::Wrapper) {
+ if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
+ GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
+ Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
+ return true;
+ }
+ }
+ return TargetLowering::isGAPlusOffset(N, GA, Offset);
+}
+
+/// PerformShuffleCombine - Combine a vector_shuffle that is equal to
+/// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load
+/// if the load addresses are consecutive, non-overlapping, and in the right
+/// order.
+static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
+ const TargetLowering &TLI) {
+ DebugLoc dl = N->getDebugLoc();
+ EVT VT = N->getValueType(0);
+ ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
+
+ if (VT.getSizeInBits() != 128)
+ return SDValue();
+
+ SmallVector<SDValue, 16> Elts;
+ for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
+ Elts.push_back(DAG.getShuffleScalarElt(SVN, i));
+
+ return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
+}
+
+/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
+/// generation and convert it from being a bunch of shuffles and extracts
+/// to a simple store and scalar loads to extract the elements.
+static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
+ const TargetLowering &TLI) {
+ SDValue InputVector = N->getOperand(0);
+
+ // Only operate on vectors of 4 elements, where the alternative shuffling
+ // gets to be more expensive.
+ if (InputVector.getValueType() != MVT::v4i32)
+ return SDValue();
+
+ // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
+ // single use which is a sign-extend or zero-extend, and all elements are
+ // used.
+ SmallVector<SDNode *, 4> Uses;
+ unsigned ExtractedElements = 0;
+ for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
+ UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
+ if (UI.getUse().getResNo() != InputVector.getResNo())
+ return SDValue();
+
+ SDNode *Extract = *UI;
+ if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+
+ if (Extract->getValueType(0) != MVT::i32)
+ return SDValue();
+ if (!Extract->hasOneUse())
+ return SDValue();
+ if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
+ Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
+ return SDValue();
+ if (!isa<ConstantSDNode>(Extract->getOperand(1)))
+ return SDValue();
+
+ // Record which element was extracted.
+ ExtractedElements |=
+ 1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
+
+ Uses.push_back(Extract);
+ }
+
+ // If not all the elements were used, this may not be worthwhile.
+ if (ExtractedElements != 15)
+ return SDValue();
+
+ // Ok, we've now decided to do the transformation.
+ DebugLoc dl = InputVector.getDebugLoc();
+
+ // Store the value to a temporary stack slot.
+ SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
+ SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, NULL, 0,
+ false, false, 0);
+
+ // Replace each use (extract) with a load of the appropriate element.
+ for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
+ UE = Uses.end(); UI != UE; ++UI) {
+ SDNode *Extract = *UI;
+
+ // Compute the element's address.
+ SDValue Idx = Extract->getOperand(1);
+ unsigned EltSize =
+ InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
+ uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
+ SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
+
+ SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), OffsetVal, StackPtr);
+
+ // Load the scalar.
+ SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch, ScalarAddr,
+ NULL, 0, false, false, 0);
+
+ // Replace the extract with the load.
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar);
+ }
+
+ // The replacement was made in place; don't return anything.
+ return SDValue();
+}
+
+/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes.
+static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ DebugLoc DL = N->getDebugLoc();
+ SDValue Cond = N->getOperand(0);
+ // Get the LHS/RHS of the select.
+ SDValue LHS = N->getOperand(1);
+ SDValue RHS = N->getOperand(2);
+
+ // If we have SSE[12] support, try to form min/max nodes. SSE min/max
+ // instructions match the semantics of the common C idiom x<y?x:y but not
+ // x<=y?x:y, because of how they handle negative zero (which can be
+ // ignored in unsafe-math mode).
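+ // For example, (select (setolt x, y), x, y) is turned into X86ISD::FMIN
+ // below unconditionally, while the SETOLE form first has to rule out the
+ // signed-zero cases (or be in unsafe-math mode).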
+ if (Subtarget->hasSSE2() &&
+ (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) &&
+ Cond.getOpcode() == ISD::SETCC) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+
+ unsigned Opcode = 0;
+ // Check for x CC y ? x : y.
+ if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
+ DAG.isEqualTo(RHS, Cond.getOperand(1))) {
+ switch (CC) {
+ default: break;
+ case ISD::SETULT:
+ // Converting this to a min would handle NaNs incorrectly, and swapping
+ // the operands would cause it to handle comparisons between positive
+ // and negative zero incorrectly.
+ if (!FiniteOnlyFPMath() &&
+ (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) {
+ if (!UnsafeFPMath &&
+ !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
+ break;
+ std::swap(LHS, RHS);
+ }
+ Opcode = X86ISD::FMIN;
+ break;
+ case ISD::SETOLE:
+ // Converting this to a min would handle comparisons between positive
+ // and negative zero incorrectly.
+ if (!UnsafeFPMath &&
+ !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
+ break;
+ Opcode = X86ISD::FMIN;
+ break;
+ case ISD::SETULE:
+ // Converting this to a min would handle both negative zeros and NaNs
+ // incorrectly, but we can swap the operands to fix both.
+ std::swap(LHS, RHS);
+ case ISD::SETOLT:
+ case ISD::SETLT:
+ case ISD::SETLE:
+ Opcode = X86ISD::FMIN;
+ break;
+
+ case ISD::SETOGE:
+ // Converting this to a max would handle comparisons between positive
+ // and negative zero incorrectly.
+ if (!UnsafeFPMath &&
+ !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
+ break;
+ Opcode = X86ISD::FMAX;
+ break;
+ case ISD::SETUGT:
+ // Converting this to a max would handle NaNs incorrectly, and swapping
+ // the operands would cause it to handle comparisons between positive
+ // and negative zero incorrectly.
+ if (!FiniteOnlyFPMath() &&
+ (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) {
+ if (!UnsafeFPMath &&
+ !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
+ break;
+ std::swap(LHS, RHS);
+ }
+ Opcode = X86ISD::FMAX;
+ break;
+ case ISD::SETUGE:
+ // Converting this to a max would handle both negative zeros and NaNs
+ // incorrectly, but we can swap the operands to fix both.
+ std::swap(LHS, RHS);
+ case ISD::SETOGT:
+ case ISD::SETGT:
+ case ISD::SETGE:
+ Opcode = X86ISD::FMAX;
+ break;
+ }
+ // Check for x CC y ? y : x -- a min/max with reversed arms.
+ } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
+ DAG.isEqualTo(RHS, Cond.getOperand(0))) {
+ switch (CC) {
+ default: break;
+ case ISD::SETOGE:
+ // Converting this to a min would handle comparisons between positive
+ // and negative zero incorrectly, and swapping the operands would
+ // cause it to handle NaNs incorrectly.
+ if (!UnsafeFPMath &&
+ !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
+ if (!FiniteOnlyFPMath() &&
+ (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
+ break;
+ std::swap(LHS, RHS);
+ }
+ Opcode = X86ISD::FMIN;
+ break;
+ case ISD::SETUGT:
+ // Converting this to a min would handle NaNs incorrectly.
+ if (!UnsafeFPMath &&
+ (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
+ break;
+ Opcode = X86ISD::FMIN;
+ break;
+ case ISD::SETUGE:
+ // Converting this to a min would handle both negative zeros and NaNs
+ // incorrectly, but we can swap the operands to fix both.
+ std::swap(LHS, RHS);
+ case ISD::SETOGT:
+ case ISD::SETGT:
+ case ISD::SETGE:
+ Opcode = X86ISD::FMIN;
+ break;
+
+ case ISD::SETULT:
+ // Converting this to a max would handle NaNs incorrectly.
+ if (!FiniteOnlyFPMath() &&
+ (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
+ break;
+ Opcode = X86ISD::FMAX;
+ break;
+ case ISD::SETOLE:
+ // Converting this to a max would handle comparisons between positive
+ // and negative zero incorrectly, and swapping the operands would
+ // cause it to handle NaNs incorrectly.
+ if (!UnsafeFPMath &&
+ !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
+ if (!FiniteOnlyFPMath() &&
+ (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
+ break;
+ std::swap(LHS, RHS);
+ }
+ Opcode = X86ISD::FMAX;
+ break;
+ case ISD::SETULE:
+ // Converting this to a max would handle both negative zeros and NaNs
+ // incorrectly, but we can swap the operands to fix both.
+ std::swap(LHS, RHS);
+ case ISD::SETOLT:
+ case ISD::SETLT:
+ case ISD::SETLE:
+ Opcode = X86ISD::FMAX;
+ break;
+ }
+ }
+
+ if (Opcode)
+ return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
+ }
+
+ // If this is a select between two integer constants, try to do some
+ // optimizations.
+ if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
+ if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
+ // Don't do this for crazy integer types.
+ if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
+ // If this is efficiently invertible, canonicalize the TrueC/FalseC values
+ // so that TrueC (the true value) is larger than FalseC.
+ bool NeedsCondInvert = false;
+
+ if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
+ // Efficiently invertible.
+ (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible.
+ (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible.
+ isa<ConstantSDNode>(Cond.getOperand(1))))) {
+ NeedsCondInvert = true;
+ std::swap(TrueC, FalseC);
+ }
+
+ // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0.
+ if (FalseC->getAPIntValue() == 0 &&
+ TrueC->getAPIntValue().isPowerOf2()) {
+ if (NeedsCondInvert) // Invert the condition if needed.
+ Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
+ DAG.getConstant(1, Cond.getValueType()));
+
+ // Zero extend the condition if needed.
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
+
+ unsigned ShAmt = TrueC->getAPIntValue().logBase2();
+ return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
+ DAG.getConstant(ShAmt, MVT::i8));
+ }
+
+ // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst.
+ if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
+ if (NeedsCondInvert) // Invert the condition if needed.
+ Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
+ DAG.getConstant(1, Cond.getValueType()));
+
+ // Zero extend the condition if needed.
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
+ FalseC->getValueType(0), Cond);
+ return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
+ SDValue(FalseC, 0));
+ }
+
+ // Optimize cases that will turn into an LEA instruction. This requires
+ // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
+ if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
+ uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
+ if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
+
+ bool isFastMultiplier = false;
+ if (Diff < 10) {
+ switch ((unsigned char)Diff) {
+ default: break;
+ case 1: // result = add base, cond
+ case 2: // result = lea base( , cond*2)
+ case 3: // result = lea base(cond, cond*2)
+ case 4: // result = lea base( , cond*4)
+ case 5: // result = lea base(cond, cond*4)
+ case 8: // result = lea base( , cond*8)
+ case 9: // result = lea base(cond, cond*8)
+ isFastMultiplier = true;
+ break;
+ }
+ }
+
+ if (isFastMultiplier) {
+ APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
+ if (NeedsCondInvert) // Invert the condition if needed.
+ Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
+ DAG.getConstant(1, Cond.getValueType()));
+
+ // Zero extend the condition if needed.
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
+ Cond);
+ // Scale the condition by the difference.
+ if (Diff != 1)
+ Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
+ DAG.getConstant(Diff, Cond.getValueType()));
+
+ // Add the base if non-zero.
+ if (FalseC->getAPIntValue() != 0)
+ Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
+ SDValue(FalseC, 0));
+ return Cond;
+ }
+ }
+ }
+ }
+
+ return SDValue();
+}
+
+/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
+static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ DebugLoc DL = N->getDebugLoc();
+
+ // If the flag operand isn't dead, don't touch this CMOV.
+ if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
+ return SDValue();
+
+ // If this is a select between two integer constants, try to do some
+ // optimizations. Note that the operands are ordered the opposite of SELECT
+ // operands.
+ if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
+ if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
+ // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
+ // larger than FalseC (the false value).
+ X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
+
+ if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
+ CC = X86::GetOppositeBranchCondition(CC);
+ std::swap(TrueC, FalseC);
+ }
+
+ // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
+ // This is efficient for any integer data type (including i8/i16) and
+ // shift amount.
+ if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
+ SDValue Cond = N->getOperand(3);
+ Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
+ DAG.getConstant(CC, MVT::i8), Cond);
+
+ // Zero extend the condition if needed.
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
+
+ unsigned ShAmt = TrueC->getAPIntValue().logBase2();
+ Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
+ DAG.getConstant(ShAmt, MVT::i8));
+ if (N->getNumValues() == 2) // Dead flag value?
+ return DCI.CombineTo(N, Cond, SDValue());
+ return Cond;
+ }
+
+ // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is efficient
+ // for any integer data type, including i8/i16.
+ if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
+ SDValue Cond = N->getOperand(3);
+ Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
+ DAG.getConstant(CC, MVT::i8), Cond);
+
+ // Zero extend the condition if needed.
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
+ FalseC->getValueType(0), Cond);
+ Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
+ SDValue(FalseC, 0));
+
+ if (N->getNumValues() == 2) // Dead flag value?
+ return DCI.CombineTo(N, Cond, SDValue());
+ return Cond;
+ }
+
+ // Optimize cases that will turn into an LEA instruction. This requires
+ // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
+ if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
+ uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
+ if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
+
+ bool isFastMultiplier = false;
+ if (Diff < 10) {
+ switch ((unsigned char)Diff) {
+ default: break;
+ case 1: // result = add base, cond
+ case 2: // result = lea base( , cond*2)
+ case 3: // result = lea base(cond, cond*2)
+ case 4: // result = lea base( , cond*4)
+ case 5: // result = lea base(cond, cond*4)
+ case 8: // result = lea base( , cond*8)
+ case 9: // result = lea base(cond, cond*8)
+ isFastMultiplier = true;
+ break;
+ }
+ }
+
+ if (isFastMultiplier) {
+ APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
+ SDValue Cond = N->getOperand(3);
+ Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
+ DAG.getConstant(CC, MVT::i8), Cond);
+ // Zero extend the condition if needed.
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
+ Cond);
+ // Scale the condition by the difference.
+ if (Diff != 1)
+ Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
+ DAG.getConstant(Diff, Cond.getValueType()));
+
+ // Add the base if non-zero.
+ if (FalseC->getAPIntValue() != 0)
+ Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
+ SDValue(FalseC, 0));
+ if (N->getNumValues() == 2) // Dead flag value?
+ return DCI.CombineTo(N, Cond, SDValue());
+ return Cond;
+ }
+ }
+ }
+ }
+ return SDValue();
+}
+
+
+/// PerformMulCombine - Optimize a single multiply by a constant into two
+/// multiplies in order to implement it with two cheaper instructions, e.g.
+/// LEA + SHL, LEA + LEA.
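+/// For example, x*45 becomes (x*9)*5 (two LEAs), and x*40 becomes a shift
+/// by 3 combined with a multiply by 5 (SHL + LEA).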
+static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::i64)
+ return SDValue();
+
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!C)
+ return SDValue();
+ uint64_t MulAmt = C->getZExtValue();
+ if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
+ return SDValue();
+
+ uint64_t MulAmt1 = 0;
+ uint64_t MulAmt2 = 0;
+ if ((MulAmt % 9) == 0) {
+ MulAmt1 = 9;
+ MulAmt2 = MulAmt / 9;
+ } else if ((MulAmt % 5) == 0) {
+ MulAmt1 = 5;
+ MulAmt2 = MulAmt / 5;
+ } else if ((MulAmt % 3) == 0) {
+ MulAmt1 = 3;
+ MulAmt2 = MulAmt / 3;
+ }
+ if (MulAmt2 &&
+ (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
+ DebugLoc DL = N->getDebugLoc();
+
+ if (isPowerOf2_64(MulAmt2) &&
+ !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
+ // If the second multiplier is pow2, issue it first. We want the multiply by
+ // 3, 5, or 9 to be folded into the addressing mode unless the lone use
+ // is an add.
+ std::swap(MulAmt1, MulAmt2);
+
+ SDValue NewMul;
+ if (isPowerOf2_64(MulAmt1))
+ NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
+ else
+ NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
+ DAG.getConstant(MulAmt1, VT));
+
+ if (isPowerOf2_64(MulAmt2))
+ NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
+ DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
+ else
+ NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
+ DAG.getConstant(MulAmt2, VT));
+
+ // Do not add new nodes to DAG combiner worklist.
+ DCI.CombineTo(N, NewMul, false);
+ }
+ return SDValue();
+}
+
+static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+ EVT VT = N0.getValueType();
+
+ // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
+ // since the result of setcc_c is all zeros or all ones.
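+ // e.g. (shl (and (setcc_carry), 1), 3) -> (and (setcc_carry), 8)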
+ if (N1C && N0.getOpcode() == ISD::AND &&
+ N0.getOperand(1).getOpcode() == ISD::Constant) {
+ SDValue N00 = N0.getOperand(0);
+ if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
+ ((N00.getOpcode() == ISD::ANY_EXTEND ||
+ N00.getOpcode() == ISD::ZERO_EXTEND) &&
+ N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
+ APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
+ APInt ShAmt = N1C->getAPIntValue();
+ Mask = Mask.shl(ShAmt);
+ if (Mask != 0)
+ return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
+ N00, DAG.getConstant(Mask, VT));
+ }
+ }
+
+ return SDValue();
+}
+
+/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts
+/// when possible.
+static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ EVT VT = N->getValueType(0);
+ if (!VT.isVector() && VT.isInteger() &&
+ N->getOpcode() == ISD::SHL)
+ return PerformSHLCombine(N, DAG);
+
+ // On X86 with SSE2 support, we can transform this to a vector shift if
+ // all elements are shifted by the same amount. We can't do this in legalize
+ // because a constant vector is typically transformed to a constant pool
+ // so we have no knowledge of the shift amount.
+ if (!Subtarget->hasSSE2())
+ return SDValue();
+
+ if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16)
+ return SDValue();
+
+ SDValue ShAmtOp = N->getOperand(1);
+ EVT EltVT = VT.getVectorElementType();
+ DebugLoc DL = N->getDebugLoc();
+ SDValue BaseShAmt = SDValue();
+ if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) {
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned i = 0;
+ for (; i != NumElts; ++i) {
+ SDValue Arg = ShAmtOp.getOperand(i);
+ if (Arg.getOpcode() == ISD::UNDEF) continue;
+ BaseShAmt = Arg;
+ break;
+ }
+ for (; i != NumElts; ++i) {
+ SDValue Arg = ShAmtOp.getOperand(i);
+ if (Arg.getOpcode() == ISD::UNDEF) continue;
+ if (Arg != BaseShAmt) {
+ return SDValue();
+ }
+ }
+ } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE &&
+ cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) {
+ SDValue InVec = ShAmtOp.getOperand(0);
+ if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
+ unsigned NumElts = InVec.getValueType().getVectorNumElements();
+ unsigned i = 0;
+ for (; i != NumElts; ++i) {
+ SDValue Arg = InVec.getOperand(i);
+ if (Arg.getOpcode() == ISD::UNDEF) continue;
+ BaseShAmt = Arg;
+ break;
+ }
+ } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
+ unsigned SplatIdx= cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex();
+ if (C->getZExtValue() == SplatIdx)
+ BaseShAmt = InVec.getOperand(1);
+ }
+ }
+ if (BaseShAmt.getNode() == 0)
+ BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp,
+ DAG.getIntPtrConstant(0));
+ } else
+ return SDValue();
+
+ // The shift amount is an i32.
+ if (EltVT.bitsGT(MVT::i32))
+ BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt);
+ else if (EltVT.bitsLT(MVT::i32))
+ BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt);
+
+ // The shift amount is identical so we can do a vector shift.
+ SDValue ValOp = N->getOperand(0);
+ switch (N->getOpcode()) {
+ default:
+ llvm_unreachable("Unknown shift opcode!");
+ break;
+ case ISD::SHL:
+ if (VT == MVT::v2i64)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
+ ValOp, BaseShAmt);
+ if (VT == MVT::v4i32)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
+ ValOp, BaseShAmt);
+ if (VT == MVT::v8i16)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
+ ValOp, BaseShAmt);
+ break;
+ case ISD::SRA:
+ if (VT == MVT::v4i32)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32),
+ ValOp, BaseShAmt);
+ if (VT == MVT::v8i16)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32),
+ ValOp, BaseShAmt);
+ break;
+ case ISD::SRL:
+ if (VT == MVT::v2i64)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
+ ValOp, BaseShAmt);
+ if (VT == MVT::v4i32)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32),
+ ValOp, BaseShAmt);
+ if (VT == MVT::v8i16)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32),
+ ValOp, BaseShAmt);
+ break;
+ }
+ return SDValue();
+}
+
+static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
+ return SDValue();
+
+ // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
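+ // The mirrored form (x >> c) | (y << (64 - c)) is matched as shrd below.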
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
+ std::swap(N0, N1);
+ if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
+ return SDValue();
+ if (!N0.hasOneUse() || !N1.hasOneUse())
+ return SDValue();
+
+ SDValue ShAmt0 = N0.getOperand(1);
+ if (ShAmt0.getValueType() != MVT::i8)
+ return SDValue();
+ SDValue ShAmt1 = N1.getOperand(1);
+ if (ShAmt1.getValueType() != MVT::i8)
+ return SDValue();
+ if (ShAmt0.getOpcode() == ISD::TRUNCATE)
+ ShAmt0 = ShAmt0.getOperand(0);
+ if (ShAmt1.getOpcode() == ISD::TRUNCATE)
+ ShAmt1 = ShAmt1.getOperand(0);
+
+ DebugLoc DL = N->getDebugLoc();
+ unsigned Opc = X86ISD::SHLD;
+ SDValue Op0 = N0.getOperand(0);
+ SDValue Op1 = N1.getOperand(0);
+ if (ShAmt0.getOpcode() == ISD::SUB) {
+ Opc = X86ISD::SHRD;
+ std::swap(Op0, Op1);
+ std::swap(ShAmt0, ShAmt1);
+ }
+
+ unsigned Bits = VT.getSizeInBits();
+ if (ShAmt1.getOpcode() == ISD::SUB) {
+ SDValue Sum = ShAmt1.getOperand(0);
+ if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
+ if (SumC->getSExtValue() == Bits &&
+ ShAmt1.getOperand(1) == ShAmt0)
+ return DAG.getNode(Opc, DL, VT,
+ Op0, Op1,
+ DAG.getNode(ISD::TRUNCATE, DL,
+ MVT::i8, ShAmt0));
+ }
+ } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
+ ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
+ if (ShAmt0C &&
+ ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits)
+ return DAG.getNode(Opc, DL, VT,
+ N0.getOperand(0), N1.getOperand(0),
+ DAG.getNode(ISD::TRUNCATE, DL,
+ MVT::i8, ShAmt0));
+ }
+
+ return SDValue();
+}
+
+/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
+static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
+ // the FP state in cases where an emms may be missing.
+ // A preferable solution to the general problem is to figure out the right
+ // places to insert EMMS. This qualifies as a quick hack.
+
+ // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
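+ // For example, on a 32-bit target with SSE2 an i64 load feeding an i64
+ // store becomes a single f64 load/store pair; without SSE2 it is split
+ // into two i32 load/store pairs.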
+ StoreSDNode *St = cast<StoreSDNode>(N);
+ EVT VT = St->getValue().getValueType();
+ if (VT.getSizeInBits() != 64)
+ return SDValue();
+
+ const Function *F = DAG.getMachineFunction().getFunction();
+ bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat);
+ bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps
+ && Subtarget->hasSSE2();
+ if ((VT.isVector() ||
+ (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
+ isa<LoadSDNode>(St->getValue()) &&
+ !cast<LoadSDNode>(St->getValue())->isVolatile() &&
+ St->getChain().hasOneUse() && !St->isVolatile()) {
+ SDNode* LdVal = St->getValue().getNode();
+ LoadSDNode *Ld = 0;
+ int TokenFactorIndex = -1;
+ SmallVector<SDValue, 8> Ops;
+ SDNode* ChainVal = St->getChain().getNode();
+ // Must be a store of a load. We currently handle two cases: the load
+ // is a direct child, or it's under an intervening TokenFactor. It is
+ // possible to dig deeper under nested TokenFactors.
+ if (ChainVal == LdVal)
+ Ld = cast<LoadSDNode>(St->getChain());
+ else if (St->getValue().hasOneUse() &&
+ ChainVal->getOpcode() == ISD::TokenFactor) {
+ for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) {
+ if (ChainVal->getOperand(i).getNode() == LdVal) {
+ TokenFactorIndex = i;
+ Ld = cast<LoadSDNode>(St->getValue());
+ } else
+ Ops.push_back(ChainVal->getOperand(i));
+ }
+ }
+
+ if (!Ld || !ISD::isNormalLoad(Ld))
+ return SDValue();
+
+ // If this is not the MMX case, i.e. we are just turning i64 load/store
+ // into f64 load/store, avoid the transformation if there are multiple
+ // uses of the loaded value.
+ if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
+ return SDValue();
+
+ DebugLoc LdDL = Ld->getDebugLoc();
+ DebugLoc StDL = N->getDebugLoc();
+ // If we are a 64-bit capable x86, lower to a single movq load/store pair.
+ // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
+ // pair instead.
+ if (Subtarget->is64Bit() || F64IsLegal) {
+ EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
+ SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(),
+ Ld->getBasePtr(), Ld->getSrcValue(),
+ Ld->getSrcValueOffset(), Ld->isVolatile(),
+ Ld->isNonTemporal(), Ld->getAlignment());
+ SDValue NewChain = NewLd.getValue(1);
+ if (TokenFactorIndex != -1) {
+ Ops.push_back(NewChain);
+ NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
+ Ops.size());
+ }
+ return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
+ St->getSrcValue(), St->getSrcValueOffset(),
+ St->isVolatile(), St->isNonTemporal(),
+ St->getAlignment());
+ }
+
+ // Otherwise, lower to two pairs of 32-bit loads / stores.
+ SDValue LoAddr = Ld->getBasePtr();
+ SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
+ DAG.getConstant(4, MVT::i32));
+
+ SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
+ Ld->getSrcValue(), Ld->getSrcValueOffset(),
+ Ld->isVolatile(), Ld->isNonTemporal(),
+ Ld->getAlignment());
+ SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
+ Ld->getSrcValue(), Ld->getSrcValueOffset()+4,
+ Ld->isVolatile(), Ld->isNonTemporal(),
+ MinAlign(Ld->getAlignment(), 4));
+
+ SDValue NewChain = LoLd.getValue(1);
+ if (TokenFactorIndex != -1) {
+ Ops.push_back(LoLd);
+ Ops.push_back(HiLd);
+ NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
+ Ops.size());
+ }
+
+ LoAddr = St->getBasePtr();
+ HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
+ DAG.getConstant(4, MVT::i32));
+
+ SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
+ St->getSrcValue(), St->getSrcValueOffset(),
+ St->isVolatile(), St->isNonTemporal(),
+ St->getAlignment());
+ SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
+ St->getSrcValue(),
+ St->getSrcValueOffset() + 4,
+ St->isVolatile(),
+ St->isNonTemporal(),
+ MinAlign(St->getAlignment(), 4));
+ return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
+ }
+ return SDValue();
+}
+
+/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and
+/// X86ISD::FXOR nodes.
+static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
+ assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
+ // F[X]OR(0.0, x) -> x
+ // F[X]OR(x, 0.0) -> x
+ if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
+ if (C->getValueAPF().isPosZero())
+ return N->getOperand(1);
+ if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
+ if (C->getValueAPF().isPosZero())
+ return N->getOperand(0);
+ return SDValue();
+}
+
+/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes.
+static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
+ // FAND(0.0, x) -> 0.0
+ // FAND(x, 0.0) -> 0.0
+ if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
+ if (C->getValueAPF().isPosZero())
+ return N->getOperand(0);
+ if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
+ if (C->getValueAPF().isPosZero())
+ return N->getOperand(1);
+ return SDValue();
+}
+
+static SDValue PerformBTCombine(SDNode *N,
+ SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ // BT ignores high bits in the bit index operand.
+ SDValue Op1 = N->getOperand(1);
+ if (Op1.hasOneUse()) {
+ unsigned BitWidth = Op1.getValueSizeInBits();
+ APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
+ APInt KnownZero, KnownOne;
+ TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+ !DCI.isBeforeLegalizeOps());
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
+ TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
+ DCI.CommitTargetLoweringOpt(TLO);
+ }
+ return SDValue();
+}
+
+static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
+ SDValue Op = N->getOperand(0);
+ if (Op.getOpcode() == ISD::BIT_CONVERT)
+ Op = Op.getOperand(0);
+ EVT VT = N->getValueType(0), OpVT = Op.getValueType();
+ if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
+ VT.getVectorElementType().getSizeInBits() ==
+ OpVT.getVectorElementType().getSizeInBits()) {
+ return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), VT, Op);
+ }
+ return SDValue();
+}
+
+// On X86 and X86-64, atomic operations are lowered to locked instructions.
+// Locked instructions, in turn, have implicit fence semantics (all memory
+ // operations are flushed before issuing the locked instruction, and they
+// are not buffered), so we can fold away the common pattern of
+// fence-atomic-fence.
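+ // The combine runs on the outer MEMBARRIER: the atomic operation's chain is
+ // rewired past the inner MEMBARRIER, and the updated atomic node then
+ // replaces the outer fence.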
+static SDValue PerformMEMBARRIERCombine(SDNode* N, SelectionDAG &DAG) {
+ SDValue atomic = N->getOperand(0);
+ switch (atomic.getOpcode()) {
+ case ISD::ATOMIC_CMP_SWAP:
+ case ISD::ATOMIC_SWAP:
+ case ISD::ATOMIC_LOAD_ADD:
+ case ISD::ATOMIC_LOAD_SUB:
+ case ISD::ATOMIC_LOAD_AND:
+ case ISD::ATOMIC_LOAD_OR:
+ case ISD::ATOMIC_LOAD_XOR:
+ case ISD::ATOMIC_LOAD_NAND:
+ case ISD::ATOMIC_LOAD_MIN:
+ case ISD::ATOMIC_LOAD_MAX:
+ case ISD::ATOMIC_LOAD_UMIN:
+ case ISD::ATOMIC_LOAD_UMAX:
+ break;
+ default:
+ return SDValue();
+ }
+
+ SDValue fence = atomic.getOperand(0);
+ if (fence.getOpcode() != ISD::MEMBARRIER)
+ return SDValue();
+
+ switch (atomic.getOpcode()) {
+ case ISD::ATOMIC_CMP_SWAP:
+ return DAG.UpdateNodeOperands(atomic, fence.getOperand(0),
+ atomic.getOperand(1), atomic.getOperand(2),
+ atomic.getOperand(3));
+ case ISD::ATOMIC_SWAP:
+ case ISD::ATOMIC_LOAD_ADD:
+ case ISD::ATOMIC_LOAD_SUB:
+ case ISD::ATOMIC_LOAD_AND:
+ case ISD::ATOMIC_LOAD_OR:
+ case ISD::ATOMIC_LOAD_XOR:
+ case ISD::ATOMIC_LOAD_NAND:
+ case ISD::ATOMIC_LOAD_MIN:
+ case ISD::ATOMIC_LOAD_MAX:
+ case ISD::ATOMIC_LOAD_UMIN:
+ case ISD::ATOMIC_LOAD_UMAX:
+ return DAG.UpdateNodeOperands(atomic, fence.getOperand(0),
+ atomic.getOperand(1), atomic.getOperand(2));
+ default:
+ return SDValue();
+ }
+}
+
+static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) {
+ // (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
+ // (and (i32 x86isd::setcc_carry), 1)
+ // This eliminates the zext. This transformation is necessary because
+ // ISD::SETCC is always legalized to i8.
+ DebugLoc dl = N->getDebugLoc();
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ if (N0.getOpcode() == ISD::AND &&
+ N0.hasOneUse() &&
+ N0.getOperand(0).hasOneUse()) {
+ SDValue N00 = N0.getOperand(0);
+ if (N00.getOpcode() != X86ISD::SETCC_CARRY)
+ return SDValue();
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+ if (!C || C->getZExtValue() != 1)
+ return SDValue();
+ return DAG.getNode(ISD::AND, dl, VT,
+ DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
+ N00.getOperand(0), N00.getOperand(1)),
+ DAG.getConstant(1, VT));
+ }
+
+ return SDValue();
+}
+
+SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ switch (N->getOpcode()) {
+ default: break;
+ case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this);
+ case ISD::EXTRACT_VECTOR_ELT:
+ return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this);
+ case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget);
+ case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI);
+ case ISD::MUL: return PerformMulCombine(N, DAG, DCI);
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL: return PerformShiftCombine(N, DAG, Subtarget);
+ case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget);
+ case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);
+ case X86ISD::FXOR:
+ case X86ISD::FOR: return PerformFORCombine(N, DAG);
+ case X86ISD::FAND: return PerformFANDCombine(N, DAG);
+ case X86ISD::BT: return PerformBTCombine(N, DAG, DCI);
+ case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG);
+ case ISD::MEMBARRIER: return PerformMEMBARRIERCombine(N, DAG);
+ case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG);
+ }
+
+ return SDValue();
+}
+
+/// isTypeDesirableForOp - Return true if the target has native support for
+/// the specified value type and it is 'desirable' to use the type for the
+/// given node type. e.g. On x86 i16 is legal, but undesirable since i16
+/// instruction encodings are longer and some i16 instructions are slow.
+bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
+ if (!isTypeLegal(VT))
+ return false;
+ if (VT != MVT::i16)
+ return true;
+
+ switch (Opc) {
+ default:
+ return true;
+ case ISD::LOAD:
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ case ISD::ANY_EXTEND:
+ case ISD::SHL:
+ case ISD::SRL:
+ case ISD::SUB:
+ case ISD::ADD:
+ case ISD::MUL:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ return false;
+ }
+}
+
+static bool MayFoldLoad(SDValue Op) {
+ return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
+}
+
+static bool MayFoldIntoStore(SDValue Op) {
+ return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
+}
+
+/// IsDesirableToPromoteOp - This method queries the target whether it is
+/// beneficial for the dag combiner to promote the specified node. If true, it
+/// should return the desired promotion type by reference.
+bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
+ EVT VT = Op.getValueType();
+ if (VT != MVT::i16)
+ return false;
+
+ bool Promote = false;
+ bool Commute = false;
+ switch (Op.getOpcode()) {
+ default: break;
+ case ISD::LOAD: {
+ LoadSDNode *LD = cast<LoadSDNode>(Op);
+ // If the non-extending load has a single use and it's not live out, then it
+ // might be folded.
+ if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
+ Op.hasOneUse()*/) {
+ for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+ UE = Op.getNode()->use_end(); UI != UE; ++UI) {
+ // The only case where we'd want to promote LOAD (rather than having it
+ // promoted as an operand) is when its only use is a liveout.
+ if (UI->getOpcode() != ISD::CopyToReg)
+ return false;
+ }
+ }
+ Promote = true;
+ break;
+ }
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ case ISD::ANY_EXTEND:
+ Promote = true;
+ break;
+ case ISD::SHL:
+ case ISD::SRL: {
+ SDValue N0 = Op.getOperand(0);
+ // Look out for (store (shl (load), x)).
+ if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
+ return false;
+ Promote = true;
+ break;
+ }
+ case ISD::ADD:
+ case ISD::MUL:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ Commute = true;
+ // fallthrough
+ case ISD::SUB: {
+ SDValue N0 = Op.getOperand(0);
+ SDValue N1 = Op.getOperand(1);
+ if (!Commute && MayFoldLoad(N1))
+ return false;
+ // Avoid disabling potential load folding opportunities.
+ if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
+ return false;
+ if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
+ return false;
+ Promote = true;
+ }
+ }
+
+ PVT = MVT::i32;
+ return Promote;
+}
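+
+// Illustrative sketch of how the two hooks above interact: a 16-bit operation
+// such as (add i16 %a, %b) is reported as undesirable by isTypeDesirableForOp,
+// and IsDesirableToPromoteOp then tells the dag combiner to redo it as an i32
+// add (PVT = MVT::i32) whose result is truncated back to i16, so isel can use
+// the shorter and usually faster 32-bit encodings.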
+
+//===----------------------------------------------------------------------===//
+// X86 Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+static bool LowerToBSwap(CallInst *CI) {
+ // FIXME: this should verify that we are targeting a 486 or better. If not,
+ // we will turn this bswap into something that will be lowered to logical ops
+ // instead of emitting the bswap asm. For now, we don't support 486 or lower
+ // so don't worry about this.
+
+ // Verify this is a simple bswap.
+ if (CI->getNumOperands() != 2 ||
+ CI->getType() != CI->getOperand(1)->getType() ||
+ !CI->getType()->isIntegerTy())
+ return false;
+
+ const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+ if (!Ty || Ty->getBitWidth() % 16 != 0)
+ return false;
+
+ // Okay, we can do this xform, do so now.
+ const Type *Tys[] = { Ty };
+ Module *M = CI->getParent()->getParent()->getParent();
+ Constant *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Tys, 1);
+
+ Value *Op = CI->getOperand(1);
+ Op = CallInst::Create(Int, Op, CI->getName(), CI);
+
+ CI->replaceAllUsesWith(Op);
+ CI->eraseFromParent();
+ return true;
+}
+
+bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
+ InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
+ std::vector<InlineAsm::ConstraintInfo> Constraints = IA->ParseConstraints();
+
+ std::string AsmStr = IA->getAsmString();
+
+ // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
+ SmallVector<StringRef, 4> AsmPieces;
+ SplitString(AsmStr, AsmPieces, "\n"); // ; as separator?
+
+ switch (AsmPieces.size()) {
+ default: return false;
+ case 1:
+ AsmStr = AsmPieces[0];
+ AsmPieces.clear();
+ SplitString(AsmStr, AsmPieces, " \t"); // Split with whitespace.
+
+ // bswap $0
+ if (AsmPieces.size() == 2 &&
+ (AsmPieces[0] == "bswap" ||
+ AsmPieces[0] == "bswapq" ||
+ AsmPieces[0] == "bswapl") &&
+ (AsmPieces[1] == "$0" ||
+ AsmPieces[1] == "${0:q}")) {
+ // No need to check constraints, nothing other than the equivalent of
+ // "=r,0" would be valid here.
+ return LowerToBSwap(CI);
+ }
+ // rorw $$8, ${0:w} --> llvm.bswap.i16
+ if (CI->getType()->isIntegerTy(16) &&
+ AsmPieces.size() == 3 &&
+ (AsmPieces[0] == "rorw" || AsmPieces[0] == "rolw") &&
+ AsmPieces[1] == "$$8," &&
+ AsmPieces[2] == "${0:w}" &&
+ IA->getConstraintString().compare(0, 5, "=r,0,") == 0) {
+ AsmPieces.clear();
+ const std::string &Constraints = IA->getConstraintString();
+ SplitString(StringRef(Constraints).substr(5), AsmPieces, ",");
+ std::sort(AsmPieces.begin(), AsmPieces.end());
+ if (AsmPieces.size() == 4 &&
+ AsmPieces[0] == "~{cc}" &&
+ AsmPieces[1] == "~{dirflag}" &&
+ AsmPieces[2] == "~{flags}" &&
+ AsmPieces[3] == "~{fpsr}") {
+ return LowerToBSwap(CI);
+ }
+ }
+ break;
+ case 3:
+ if (CI->getType()->isIntegerTy(64) &&
+ Constraints.size() >= 2 &&
+ Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
+ Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
+ // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
+ SmallVector<StringRef, 4> Words;
+ SplitString(AsmPieces[0], Words, " \t");
+ if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") {
+ Words.clear();
+ SplitString(AsmPieces[1], Words, " \t");
+ if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") {
+ Words.clear();
+ SplitString(AsmPieces[2], Words, " \t,");
+ if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" &&
+ Words[2] == "%edx") {
+ return LowerToBSwap(CI);
+ }
+ }
+ }
+ }
+ break;
+ }
+ return false;
+}
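+
+// Illustrative summary of the asm strings recognized above; each one is
+// rewritten into a call to the llvm.bswap intrinsic by LowerToBSwap:
+//   "bswap $0" (also bswapl/bswapq, "${0:q}")              -> llvm.bswap.i32/i64
+//   "rorw $$8, ${0:w}" with constraints "=r,0,~{cc},..."   -> llvm.bswap.i16
+//   "bswap %eax\n\tbswap %edx\n\txchgl %eax, %edx"
+//     with an "A" constraint tied to "0"                   -> llvm.bswap.i64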
+
+
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+X86TargetLowering::ConstraintType
+X86TargetLowering::getConstraintType(const std::string &Constraint) const {
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ case 'A':
+ return C_Register;
+ case 'f':
+ case 'r':
+ case 'R':
+ case 'l':
+ case 'q':
+ case 'Q':
+ case 'x':
+ case 'y':
+ case 'Y':
+ return C_RegisterClass;
+ case 'e':
+ case 'Z':
+ return C_Other;
+ default:
+ break;
+ }
+ }
+ return TargetLowering::getConstraintType(Constraint);
+}
+
+/// LowerXConstraint - try to replace an X constraint, which matches anything,
+/// with another that has more specific requirements based on the type of the
+/// corresponding operand.
+const char *X86TargetLowering::
+LowerXConstraint(EVT ConstraintVT) const {
+ // FP X constraints get lowered to SSE1/2 registers if available, otherwise
+ // 'f' like normal targets.
+ if (ConstraintVT.isFloatingPoint()) {
+ if (Subtarget->hasSSE2())
+ return "Y";
+ if (Subtarget->hasSSE1())
+ return "x";
+ }
+
+ return TargetLowering::LowerXConstraint(ConstraintVT);
+}
+
+/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+/// vector. If it is invalid, don't add anything to Ops.
+void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
+ char Constraint,
+ bool hasMemory,
+ std::vector<SDValue>&Ops,
+ SelectionDAG &DAG) const {
+ SDValue Result(0, 0);
+
+ switch (Constraint) {
+ default: break;
+ case 'I':
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->getZExtValue() <= 31) {
+ Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
+ break;
+ }
+ }
+ return;
+ case 'J':
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->getZExtValue() <= 63) {
+ Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
+ break;
+ }
+ }
+ return;
+ case 'K':
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if ((int8_t)C->getSExtValue() == C->getSExtValue()) {
+ Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
+ break;
+ }
+ }
+ return;
+ case 'N':
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->getZExtValue() <= 255) {
+ Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
+ break;
+ }
+ }
+ return;
+ case 'e': {
+ // 32-bit signed value
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ const ConstantInt *CI = C->getConstantIntValue();
+ if (CI->isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
+ C->getSExtValue())) {
+ // Widen to 64 bits here to get it sign extended.
+ Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
+ break;
+ }
+ // FIXME gcc accepts some relocatable values here too, but only in certain
+ // memory models; it's complicated.
+ }
+ return;
+ }
+ case 'Z': {
+ // 32-bit unsigned value
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ const ConstantInt *CI = C->getConstantIntValue();
+ if (CI->isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
+ C->getZExtValue())) {
+ Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
+ break;
+ }
+ }
+ // FIXME gcc accepts some relocatable values here too, but only in certain
+ // memory models; it's complicated.
+ return;
+ }
+ case 'i': {
+ // Literal immediates are always ok.
+ if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
+ // Widen to 64 bits here to get it sign extended.
+ Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
+ break;
+ }
+
+ // If we are in non-pic codegen mode, we allow the address of a global (with
+ // an optional displacement) to be used with 'i'.
+ GlobalAddressSDNode *GA = 0;
+ int64_t Offset = 0;
+
+ // Match either (GA), (GA+C), (GA+C1+C2), etc.
+ while (1) {
+ if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
+ Offset += GA->getOffset();
+ break;
+ } else if (Op.getOpcode() == ISD::ADD) {
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ Offset += C->getZExtValue();
+ Op = Op.getOperand(0);
+ continue;
+ }
+ } else if (Op.getOpcode() == ISD::SUB) {
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ Offset += -C->getZExtValue();
+ Op = Op.getOperand(0);
+ continue;
+ }
+ }
+
+ // Otherwise, this isn't something we can handle, reject it.
+ return;
+ }
+
+ const GlobalValue *GV = GA->getGlobal();
+ // If we require an extra load to get this address, as in PIC mode, we
+ // can't accept it.
+ if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV,
+ getTargetMachine())))
+ return;
+
+ if (hasMemory)
+ Op = LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG);
+ else
+ Op = DAG.getTargetGlobalAddress(GV, GA->getValueType(0), Offset);
+ Result = Op;
+ break;
+ }
+ }
+
+ if (Result.getNode()) {
+ Ops.push_back(Result);
+ return;
+ }
+ return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, hasMemory,
+ Ops, DAG);
+}
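+
+// Summary of the immediate constraints handled above (mirroring the GCC x86
+// constraint letters):
+//   'I': 0..31    'J': 0..63    'K': signed 8-bit    'N': 0..255
+//   'e': signed 32-bit          'Z': unsigned 32-bit
+//   'i': any literal immediate, or a global address (plus offset) that does
+//        not require a stub load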
+
+std::vector<unsigned> X86TargetLowering::
+getRegClassForInlineAsmConstraint(const std::string &Constraint,
+ EVT VT) const {
+ if (Constraint.size() == 1) {
+ // FIXME: not handling fp-stack yet!
+ switch (Constraint[0]) { // GCC X86 Constraint Letters
+ default: break; // Unknown constraint letter
+ case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
+ if (Subtarget->is64Bit()) {
+ if (VT == MVT::i32)
+ return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX,
+ X86::ESI, X86::EDI, X86::R8D, X86::R9D,
+ X86::R10D,X86::R11D,X86::R12D,
+ X86::R13D,X86::R14D,X86::R15D,
+ X86::EBP, X86::ESP, 0);
+ else if (VT == MVT::i16)
+ return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX,
+ X86::SI, X86::DI, X86::R8W,X86::R9W,
+ X86::R10W,X86::R11W,X86::R12W,
+ X86::R13W,X86::R14W,X86::R15W,
+ X86::BP, X86::SP, 0);
+ else if (VT == MVT::i8)
+ return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL,
+ X86::SIL, X86::DIL, X86::R8B,X86::R9B,
+ X86::R10B,X86::R11B,X86::R12B,
+ X86::R13B,X86::R14B,X86::R15B,
+ X86::BPL, X86::SPL, 0);
+
+ else if (VT == MVT::i64)
+ return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX,
+ X86::RSI, X86::RDI, X86::R8, X86::R9,
+ X86::R10, X86::R11, X86::R12,
+ X86::R13, X86::R14, X86::R15,
+ X86::RBP, X86::RSP, 0);
+
+ break;
+ }
+ // 32-bit fallthrough
+ case 'Q': // Q_REGS
+ if (VT == MVT::i32)
+ return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 0);
+ else if (VT == MVT::i16)
+ return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 0);
+ else if (VT == MVT::i8)
+ return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 0);
+ else if (VT == MVT::i64)
+ return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 0);
+ break;
+ }
+ }
+
+ return std::vector<unsigned>();
+}
+
+std::pair<unsigned, const TargetRegisterClass*>
+X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
+ EVT VT) const {
+ // First, see if this is a constraint that directly corresponds to an LLVM
+ // register class.
+ if (Constraint.size() == 1) {
+ // GCC Constraint Letters
+ switch (Constraint[0]) {
+ default: break;
+ case 'r': // GENERAL_REGS
+ case 'l': // INDEX_REGS
+ if (VT == MVT::i8)
+ return std::make_pair(0U, X86::GR8RegisterClass);
+ if (VT == MVT::i16)
+ return std::make_pair(0U, X86::GR16RegisterClass);
+ if (VT == MVT::i32 || !Subtarget->is64Bit())
+ return std::make_pair(0U, X86::GR32RegisterClass);
+ return std::make_pair(0U, X86::GR64RegisterClass);
+ case 'R': // LEGACY_REGS
+ if (VT == MVT::i8)
+ return std::make_pair(0U, X86::GR8_NOREXRegisterClass);
+ if (VT == MVT::i16)
+ return std::make_pair(0U, X86::GR16_NOREXRegisterClass);
+ if (VT == MVT::i32 || !Subtarget->is64Bit())
+ return std::make_pair(0U, X86::GR32_NOREXRegisterClass);
+ return std::make_pair(0U, X86::GR64_NOREXRegisterClass);
+ case 'f': // FP Stack registers.
+ // If SSE is enabled for this VT, use f80 to ensure the isel moves the
+ // value to the correct fpstack register class.
+ if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
+ return std::make_pair(0U, X86::RFP32RegisterClass);
+ if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
+ return std::make_pair(0U, X86::RFP64RegisterClass);
+ return std::make_pair(0U, X86::RFP80RegisterClass);
+ case 'y': // MMX_REGS if MMX allowed.
+ if (!Subtarget->hasMMX()) break;
+ return std::make_pair(0U, X86::VR64RegisterClass);
+ case 'Y': // SSE_REGS if SSE2 allowed
+ if (!Subtarget->hasSSE2()) break;
+ // FALL THROUGH.
+ case 'x': // SSE_REGS if SSE1 allowed
+ if (!Subtarget->hasSSE1()) break;
+
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: break;
+ // Scalar SSE types.
+ case MVT::f32:
+ case MVT::i32:
+ return std::make_pair(0U, X86::FR32RegisterClass);
+ case MVT::f64:
+ case MVT::i64:
+ return std::make_pair(0U, X86::FR64RegisterClass);
+ // Vector types.
+ case MVT::v16i8:
+ case MVT::v8i16:
+ case MVT::v4i32:
+ case MVT::v2i64:
+ case MVT::v4f32:
+ case MVT::v2f64:
+ return std::make_pair(0U, X86::VR128RegisterClass);
+ }
+ break;
+ }
+ }
+
+ // Use the default implementation in TargetLowering to convert the register
+ // constraint into a member of a register class.
+ std::pair<unsigned, const TargetRegisterClass*> Res;
+ Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+
+ // Not found as a standard register?
+ if (Res.second == 0) {
+ // Map st(0) -> st(7) -> ST0
+ if (Constraint.size() == 7 && Constraint[0] == '{' &&
+ tolower(Constraint[1]) == 's' &&
+ tolower(Constraint[2]) == 't' &&
+ Constraint[3] == '(' &&
+ (Constraint[4] >= '0' && Constraint[4] <= '7') &&
+ Constraint[5] == ')' &&
+ Constraint[6] == '}') {
+
+ Res.first = X86::ST0+Constraint[4]-'0';
+ Res.second = X86::RFP80RegisterClass;
+ return Res;
+ }
+
+ // GCC allows "st(0)" to be called just plain "st".
+ if (StringRef("{st}").equals_lower(Constraint)) {
+ Res.first = X86::ST0;
+ Res.second = X86::RFP80RegisterClass;
+ return Res;
+ }
+
+ // flags -> EFLAGS
+ if (StringRef("{flags}").equals_lower(Constraint)) {
+ Res.first = X86::EFLAGS;
+ Res.second = X86::CCRRegisterClass;
+ return Res;
+ }
+
+ // 'A' means EAX + EDX.
+ if (Constraint == "A") {
+ Res.first = X86::EAX;
+ Res.second = X86::GR32_ADRegisterClass;
+ return Res;
+ }
+ return Res;
+ }
+
+ // Otherwise, check to see if this is a register class of the wrong value
+ // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
+ // turn into {ax},{dx}.
+ if (Res.second->hasType(VT))
+ return Res; // Correct type already, nothing to do.
+
+ // All of the single-register GCC register classes map their values onto
+ // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp". If we
+ // really want an 8-bit or 32-bit register, map to the appropriate register
+ // class and return the appropriate register.
+ if (Res.second == X86::GR16RegisterClass) {
+ if (VT == MVT::i8) {
+ unsigned DestReg = 0;
+ switch (Res.first) {
+ default: break;
+ case X86::AX: DestReg = X86::AL; break;
+ case X86::DX: DestReg = X86::DL; break;
+ case X86::CX: DestReg = X86::CL; break;
+ case X86::BX: DestReg = X86::BL; break;
+ }
+ if (DestReg) {
+ Res.first = DestReg;
+ Res.second = X86::GR8RegisterClass;
+ }
+ } else if (VT == MVT::i32) {
+ unsigned DestReg = 0;
+ switch (Res.first) {
+ default: break;
+ case X86::AX: DestReg = X86::EAX; break;
+ case X86::DX: DestReg = X86::EDX; break;
+ case X86::CX: DestReg = X86::ECX; break;
+ case X86::BX: DestReg = X86::EBX; break;
+ case X86::SI: DestReg = X86::ESI; break;
+ case X86::DI: DestReg = X86::EDI; break;
+ case X86::BP: DestReg = X86::EBP; break;
+ case X86::SP: DestReg = X86::ESP; break;
+ }
+ if (DestReg) {
+ Res.first = DestReg;
+ Res.second = X86::GR32RegisterClass;
+ }
+ } else if (VT == MVT::i64) {
+ unsigned DestReg = 0;
+ switch (Res.first) {
+ default: break;
+ case X86::AX: DestReg = X86::RAX; break;
+ case X86::DX: DestReg = X86::RDX; break;
+ case X86::CX: DestReg = X86::RCX; break;
+ case X86::BX: DestReg = X86::RBX; break;
+ case X86::SI: DestReg = X86::RSI; break;
+ case X86::DI: DestReg = X86::RDI; break;
+ case X86::BP: DestReg = X86::RBP; break;
+ case X86::SP: DestReg = X86::RSP; break;
+ }
+ if (DestReg) {
+ Res.first = DestReg;
+ Res.second = X86::GR64RegisterClass;
+ }
+ }
+ } else if (Res.second == X86::FR32RegisterClass ||
+ Res.second == X86::FR64RegisterClass ||
+ Res.second == X86::VR128RegisterClass) {
+ // Handle references to XMM physical registers that got mapped into the
+ // wrong class. This can happen with constraints like {xmm0} where the
+ // target independent register mapper will just pick the first match it can
+ // find, ignoring the required type.
+ if (VT == MVT::f32)
+ Res.second = X86::FR32RegisterClass;
+ else if (VT == MVT::f64)
+ Res.second = X86::FR64RegisterClass;
+ else if (X86::VR128RegisterClass->hasType(VT))
+ Res.second = X86::VR128RegisterClass;
+ }
+
+ return Res;
+}
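+
+// For example, the physical register constraint "{ax}" used with an i32
+// operand is first resolved to (AX, GR16) by the generic mapper; the code
+// above then rewrites it to (EAX, GR32) so the value is not split into
+// {ax},{dx}.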
diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.h b/contrib/llvm/lib/Target/X86/X86ISelLowering.h
new file mode 100644
index 0000000..1ef1a7b
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.h
@@ -0,0 +1,821 @@
+//===-- X86ISelLowering.h - X86 DAG Lowering Interface ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that X86 uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86ISELLOWERING_H
+#define X86ISELLOWERING_H
+
+#include "X86Subtarget.h"
+#include "X86RegisterInfo.h"
+#include "X86MachineFunctionInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+
+namespace llvm {
+ namespace X86ISD {
+ // X86 Specific DAG Nodes
+ enum NodeType {
+ // Start the numbering where the builtin ops leave off.
+ FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+ /// BSF - Bit scan forward.
+ /// BSR - Bit scan reverse.
+ BSF,
+ BSR,
+
+ /// SHLD, SHRD - Double shift instructions. These correspond to
+ /// X86::SHLDxx and X86::SHRDxx instructions.
+ SHLD,
+ SHRD,
+
+ /// FAND - Bitwise logical AND of floating point values. This corresponds
+ /// to X86::ANDPS or X86::ANDPD.
+ FAND,
+
+ /// FOR - Bitwise logical OR of floating point values. This corresponds
+ /// to X86::ORPS or X86::ORPD.
+ FOR,
+
+ /// FXOR - Bitwise logical XOR of floating point values. This corresponds
+ /// to X86::XORPS or X86::XORPD.
+ FXOR,
+
+ /// FSRL - Bitwise logical right shift of floating point values. This
+ /// corresponds to X86::PSRLDQ.
+ FSRL,
+
+ /// FILD, FILD_FLAG - This instruction implements SINT_TO_FP with the
+ /// integer source in memory and FP reg result. This corresponds to the
+ /// X86::FILD*m instructions. It has three inputs (token chain, address,
+ /// and source type) and two outputs (FP value and token chain). FILD_FLAG
+ /// also produces a flag.
+ FILD,
+ FILD_FLAG,
+
+ /// FP_TO_INT*_IN_MEM - This instruction implements FP_TO_SINT with the
+ /// integer destination in memory and a FP reg source. This corresponds
+ /// to the X86::FIST*m instructions and the rounding mode change stuff. It
+ /// has two inputs (token chain and address) and two outputs (int value
+ /// and token chain).
+ FP_TO_INT16_IN_MEM,
+ FP_TO_INT32_IN_MEM,
+ FP_TO_INT64_IN_MEM,
+
+ /// FLD - This instruction implements an extending load to FP stack slots.
+ /// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain
+ /// operand, ptr to load from, and a ValueType node indicating the type
+ /// to load to.
+ FLD,
+
+ /// FST - This instruction implements a truncating store to FP stack
+ /// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a
+ /// chain operand, value to store, address, and a ValueType to store it
+ /// as.
+ FST,
+
+ /// CALL - These operations represent an abstract X86 call
+ /// instruction, which includes a bunch of information. In particular the
+ /// operands of these node are:
+ ///
+ /// #0 - The incoming token chain
+ /// #1 - The callee
+ /// #2 - The number of arg bytes the caller pushes on the stack.
+ /// #3 - The number of arg bytes the callee pops off the stack.
+ /// #4 - The value to pass in AL/AX/EAX (optional)
+ /// #5 - The value to pass in DL/DX/EDX (optional)
+ ///
+ /// The result values of these nodes are:
+ ///
+ /// #0 - The outgoing token chain
+ /// #1 - The first register result value (optional)
+ /// #2 - The second register result value (optional)
+ ///
+ CALL,
+
+ /// RDTSC_DAG - This operation implements the lowering for
+ /// readcyclecounter
+ RDTSC_DAG,
+
+ /// X86 compare and logical compare instructions.
+ CMP, COMI, UCOMI,
+
+ /// X86 bit-test instructions.
+ BT,
+
+ /// X86 SetCC. Operand 0 is condition code, and operand 1 is the flag
+ /// operand produced by a CMP instruction.
+ SETCC,
+
+ // Same as SETCC except it's materialized with an sbb and the value is all
+ // ones or all zeros.
+ SETCC_CARRY,
+
+ /// X86 conditional moves. Operand 0 and operand 1 are the two values
+ /// to select from. Operand 2 is the condition code, and operand 3 is the
+ /// flag operand produced by a CMP or TEST instruction. It also writes a
+ /// flag result.
+ CMOV,
+
+ /// X86 conditional branches. Operand 0 is the chain operand, operand 1
+ /// is the block to branch if condition is true, operand 2 is the
+ /// condition code, and operand 3 is the flag operand produced by a CMP
+ /// or TEST instruction.
+ BRCOND,
+
+ /// Return with a flag operand. Operand 0 is the chain operand, operand
+ /// 1 is the number of bytes of stack to pop.
+ RET_FLAG,
+
+ /// REP_STOS - Repeat fill, corresponds to X86::REP_STOSx.
+ REP_STOS,
+
+ /// REP_MOVS - Repeat move, corresponds to X86::REP_MOVSx.
+ REP_MOVS,
+
+ /// GlobalBaseReg - On Darwin, this node represents the result of the popl
+ /// at function entry, used for PIC code.
+ GlobalBaseReg,
+
+ /// Wrapper - A wrapper node for TargetConstantPool,
+ /// TargetExternalSymbol, and TargetGlobalAddress.
+ Wrapper,
+
+ /// WrapperRIP - Special wrapper used under X86-64 PIC mode for RIP
+ /// relative displacements.
+ WrapperRIP,
+
+ /// MOVQ2DQ - Copies a 64-bit value from a vector to another vector.
+ /// Can be used to move a vector value from an MMX register to an XMM
+ /// register.
+ MOVQ2DQ,
+
+ /// PEXTRB - Extract an 8-bit value from a vector and zero extend it to
+ /// i32, corresponds to X86::PEXTRB.
+ PEXTRB,
+
+ /// PEXTRW - Extract a 16-bit value from a vector and zero extend it to
+ /// i32, corresponds to X86::PEXTRW.
+ PEXTRW,
+
+ /// INSERTPS - Insert any element of a 4 x float vector into any element
+ /// of a destination 4 x float vector.
+ INSERTPS,
+
+ /// PINSRB - Insert the lower 8 bits of a 32-bit value into a vector,
+ /// corresponds to X86::PINSRB.
+ PINSRB,
+
+ /// PINSRW - Insert the lower 16 bits of a 32-bit value into a vector,
+ /// corresponds to X86::PINSRW.
+ PINSRW, MMX_PINSRW,
+
+ /// PSHUFB - Shuffle 16 8-bit values within a vector.
+ PSHUFB,
+
+ /// FMAX, FMIN - Floating point max and min.
+ ///
+ FMAX, FMIN,
+
+ /// FRSQRT, FRCP - Floating point reciprocal-sqrt and reciprocal
+ /// approximation. Note that these typically require refinement
+ /// in order to obtain suitable precision.
+ FRSQRT, FRCP,
+
+ // TLSADDR - Thread Local Storage.
+ TLSADDR,
+
+ // SegmentBaseAddress - The address segment:0
+ SegmentBaseAddress,
+
+ // EH_RETURN - Exception Handling helpers.
+ EH_RETURN,
+
+ /// TC_RETURN - Tail call return.
+ /// operand #0 chain
+ /// operand #1 callee (register or absolute)
+ /// operand #2 stack adjustment
+ /// operand #3 optional in flag
+ TC_RETURN,
+
+ // LCMPXCHG_DAG, LCMPXCHG8_DAG - Compare and swap.
+ LCMPXCHG_DAG,
+ LCMPXCHG8_DAG,
+
+ // FNSTCW16m - Store FP control word into i16 memory.
+ FNSTCW16m,
+
+ // VZEXT_MOVL - Vector move low and zero extend.
+ VZEXT_MOVL,
+
+ // VZEXT_LOAD - Load, scalar_to_vector, and zero extend.
+ VZEXT_LOAD,
+
+ // VSHL, VSRL - Vector logical left / right shift.
+ VSHL, VSRL,
+
+ // CMPPD, CMPPS - Vector double/float comparison.
+ CMPPD, CMPPS,
+
+ // PCMP* - Vector integer comparisons.
+ PCMPEQB, PCMPEQW, PCMPEQD, PCMPEQQ,
+ PCMPGTB, PCMPGTW, PCMPGTD, PCMPGTQ,
+
+ // ADD, SUB, SMUL, UMUL, etc. - Arithmetic operations with FLAGS results.
+ ADD, SUB, SMUL, UMUL,
+ INC, DEC, OR, XOR, AND,
+
+ // MUL_IMM - X86 specific multiply by immediate.
+ MUL_IMM,
+
+ // PTEST - Vector bitwise comparisons
+ PTEST,
+
+ // VASTART_SAVE_XMM_REGS - Save xmm argument registers to the stack,
+ // according to %al. An operator is needed so that this can be expanded
+ // with control flow.
+ VASTART_SAVE_XMM_REGS,
+
+ // MINGW_ALLOCA - MinGW's __alloca call to do stack probing.
+ MINGW_ALLOCA,
+
+ // ATOMADD64_DAG, ATOMSUB64_DAG, ATOMOR64_DAG, ATOMAND64_DAG,
+ // ATOMXOR64_DAG, ATOMNAND64_DAG, ATOMSWAP64_DAG -
+ // Atomic 64-bit binary operations.
+ ATOMADD64_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE,
+ ATOMSUB64_DAG,
+ ATOMOR64_DAG,
+ ATOMXOR64_DAG,
+ ATOMAND64_DAG,
+ ATOMNAND64_DAG,
+ ATOMSWAP64_DAG
+
+ // WARNING: Do not add anything at the end unless you want the node to
+ // have memop! In fact, starting from ATOMADD64_DAG all opcodes will be
+ // treated as target memory ops!
+ };
+ }
+
+ /// Define some predicates that are used for node matching.
+ namespace X86 {
+ /// isPSHUFDMask - Return true if the specified VECTOR_SHUFFLE operand
+ /// specifies a shuffle of elements that is suitable for input to PSHUFD.
+ bool isPSHUFDMask(ShuffleVectorSDNode *N);
+
+ /// isPSHUFHWMask - Return true if the specified VECTOR_SHUFFLE operand
+ /// specifies a shuffle of elements that is suitable for input to PSHUFHW.
+ bool isPSHUFHWMask(ShuffleVectorSDNode *N);
+
+ /// isPSHUFLWMask - Return true if the specified VECTOR_SHUFFLE operand
+ /// specifies a shuffle of elements that is suitable for input to PSHUFLW.
+ bool isPSHUFLWMask(ShuffleVectorSDNode *N);
+
+ /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
+ /// specifies a shuffle of elements that is suitable for input to SHUFP*.
+ bool isSHUFPMask(ShuffleVectorSDNode *N);
+
+ /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
+ /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
+ bool isMOVHLPSMask(ShuffleVectorSDNode *N);
+
+ /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
+ /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
+ /// <2, 3, 2, 3>
+ bool isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N);
+
+ /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
+ /// specifies a shuffle of elements that is suitable for MOVLP{S|D}.
+ bool isMOVLPMask(ShuffleVectorSDNode *N);
+
+ /// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
+ /// specifies a shuffle of elements that is suitable for MOVHP{S|D}
+ /// as well as MOVLHPS.
+ bool isMOVLHPSMask(ShuffleVectorSDNode *N);
+
+ /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
+ /// specifies a shuffle of elements that is suitable for input to UNPCKL.
+ bool isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat = false);
+
+ /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
+ /// specifies a shuffle of elements that is suitable for input to UNPCKH.
+ bool isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat = false);
+
+ /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
+ /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
+ /// <0, 0, 1, 1>
+ bool isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N);
+
+ /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
+ /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
+ /// <2, 2, 3, 3>
+ bool isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N);
+
+ /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
+ /// specifies a shuffle of elements that is suitable for input to MOVSS,
+ /// MOVSD, and MOVD, i.e. setting the lowest element.
+ bool isMOVLMask(ShuffleVectorSDNode *N);
+
+ /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
+ /// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
+ bool isMOVSHDUPMask(ShuffleVectorSDNode *N);
+
+ /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
+ /// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
+ bool isMOVSLDUPMask(ShuffleVectorSDNode *N);
+
+ /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
+ /// specifies a shuffle of elements that is suitable for input to MOVDDUP.
+ bool isMOVDDUPMask(ShuffleVectorSDNode *N);
+
+ /// isPALIGNRMask - Return true if the specified VECTOR_SHUFFLE operand
+ /// specifies a shuffle of elements that is suitable for input to PALIGNR.
+ bool isPALIGNRMask(ShuffleVectorSDNode *N);
+
+ /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
+ /// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUF* and SHUFP*
+ /// instructions.
+ unsigned getShuffleSHUFImmediate(SDNode *N);
+
+ /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
+ /// the specified VECTOR_SHUFFLE mask with PSHUFHW instruction.
+ unsigned getShufflePSHUFHWImmediate(SDNode *N);
+
+ /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
+ /// the specified VECTOR_SHUFFLE mask with PSHUFLW instruction.
+ unsigned getShufflePSHUFLWImmediate(SDNode *N);
+
+ /// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
+ /// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
+ unsigned getShufflePALIGNRImmediate(SDNode *N);
+
+ /// isZeroNode - Returns true if Elt is a constant zero or a floating point
+ /// constant +0.0.
+ bool isZeroNode(SDValue Elt);
+
+ /// isOffsetSuitableForCodeModel - Returns true if the given offset can
+ /// fit into the displacement field of the instruction.
+ bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
+ bool hasSymbolicDisplacement = true);
+ }
+
+ //===--------------------------------------------------------------------===//
+ // X86TargetLowering - X86 Implementation of the TargetLowering interface
+ class X86TargetLowering : public TargetLowering {
+ public:
+ explicit X86TargetLowering(X86TargetMachine &TM);
+
+ /// getPICBaseSymbol - Return the X86-32 PIC base.
+ MCSymbol *getPICBaseSymbol(const MachineFunction *MF, MCContext &Ctx) const;
+
+ virtual unsigned getJumpTableEncoding() const;
+
+ virtual const MCExpr *
+ LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
+ const MachineBasicBlock *MBB, unsigned uid,
+ MCContext &Ctx) const;
+
+ /// getPICJumpTableRelocBase - Returns the relocation base for the given PIC
+ /// jumptable.
+ virtual SDValue getPICJumpTableRelocBase(SDValue Table,
+ SelectionDAG &DAG) const;
+ virtual const MCExpr *
+ getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
+ unsigned JTI, MCContext &Ctx) const;
+
+ /// getStackPtrReg - Return the stack pointer register we are using: either
+ /// ESP or RSP.
+ unsigned getStackPtrReg() const { return X86StackPtr; }
+
+ /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
+ /// function arguments in the caller parameter area. For X86, aggregates that
+ /// contain SSE vectors are placed at 16-byte boundaries while the rest are at
+ /// 4-byte boundaries.
+ virtual unsigned getByValTypeAlignment(const Type *Ty) const;
+
+ /// getOptimalMemOpType - Returns the target specific optimal type for load
+ /// and store operations as a result of memset, memcpy, and memmove
+ /// lowering. If DstAlign is zero, that means the destination alignment can
+ /// satisfy any constraint. Similarly, if SrcAlign is zero it means there is
+ /// no need to check it against the alignment requirement,
+ /// probably because the source does not need to be loaded. If
+ /// 'NonScalarIntSafe' is true, that means it's safe to return a
+ /// non-scalar-integer type, e.g. empty string source, constant, or loaded
+ /// from memory. 'MemcpyStrSrc' indicates whether the memcpy source is
+ /// constant so it does not need to be loaded.
+ /// It returns EVT::Other if the type should be determined using generic
+ /// target-independent logic.
+ virtual EVT
+ getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
+ bool NonScalarIntSafe, bool MemcpyStrSrc,
+ MachineFunction &MF) const;
+
+ /// allowsUnalignedMemoryAccesses - Returns true if the target allows
+ /// unaligned memory accesses of the specified type.
+ virtual bool allowsUnalignedMemoryAccesses(EVT VT) const {
+ return true;
+ }
+
+ /// LowerOperation - Provide custom lowering hooks for some operations.
+ ///
+ virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
+
+ /// ReplaceNodeResults - Replace the results of node with an illegal result
+ /// type with new values built out of custom code.
+ ///
+ virtual void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
+ SelectionDAG &DAG) const;
+
+
+ virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
+ /// isTypeDesirableForOp - Return true if the target has native support for
+ /// the specified value type and it is 'desirable' to use the type for the
+ /// given node type. e.g. On x86 i16 is legal, but undesirable since i16
+ /// instruction encodings are longer and some i16 instructions are slow.
+ virtual bool isTypeDesirableForOp(unsigned Opc, EVT VT) const;
+
+ /// IsDesirableToPromoteOp - Return true if it is beneficial for the dag
+ /// combiner to promote the specified node; if so, the desired promotion
+ /// type is returned by reference in PVT. e.g. On x86 i16 ops are promoted
+ /// to i32 since i16 instruction encodings are longer and some i16
+ /// instructions are slow.
+ virtual bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const;
+
+ virtual MachineBasicBlock *
+ EmitInstrWithCustomInserter(MachineInstr *MI,
+ MachineBasicBlock *MBB) const;
+
+
+ /// getTargetNodeName - This method returns the name of a target specific
+ /// DAG node.
+ virtual const char *getTargetNodeName(unsigned Opcode) const;
+
+ /// getSetCCResultType - Return the ISD::SETCC ValueType
+ virtual MVT::SimpleValueType getSetCCResultType(EVT VT) const;
+
+ /// computeMaskedBitsForTargetNode - Determine which of the bits specified
+ /// in Mask are known to be either zero or one and return them in the
+ /// KnownZero/KnownOne bitsets.
+ virtual void computeMaskedBitsForTargetNode(const SDValue Op,
+ const APInt &Mask,
+ APInt &KnownZero,
+ APInt &KnownOne,
+ const SelectionDAG &DAG,
+ unsigned Depth = 0) const;
+
+ virtual bool
+ isGAPlusOffset(SDNode *N, const GlobalValue* &GA, int64_t &Offset) const;
+
+ SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const;
+
+ virtual bool ExpandInlineAsm(CallInst *CI) const;
+
+ ConstraintType getConstraintType(const std::string &Constraint) const;
+
+ std::vector<unsigned>
+ getRegClassForInlineAsmConstraint(const std::string &Constraint,
+ EVT VT) const;
+
+ virtual const char *LowerXConstraint(EVT ConstraintVT) const;
+
+ /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+ /// vector. If it is invalid, don't add anything to Ops. If hasMemory is
+ /// true it means one of the asm constraint of the inline asm instruction
+ /// true, it means one of the asm constraints of the inline asm instruction
+ virtual void LowerAsmOperandForConstraint(SDValue Op,
+ char ConstraintLetter,
+ bool hasMemory,
+ std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const;
+
+ /// getRegForInlineAsmConstraint - Given a physical register constraint
+ /// (e.g. {edx}), return the register number and the register class for the
+ /// register. This should only be used for C_Register constraints. On
+ /// error, this returns a register number of 0.
+ std::pair<unsigned, const TargetRegisterClass*>
+ getRegForInlineAsmConstraint(const std::string &Constraint,
+ EVT VT) const;
+
+ /// isLegalAddressingMode - Return true if the addressing mode represented
+ /// by AM is legal for this target, for a load/store of the specified type.
+ virtual bool isLegalAddressingMode(const AddrMode &AM, const Type *Ty)const;
+
+ /// isTruncateFree - Return true if it's free to truncate a value of
+ /// type Ty1 to type Ty2. e.g. On x86 it's free to truncate a i32 value in
+ /// register EAX to i16 by referencing its sub-register AX.
+ virtual bool isTruncateFree(const Type *Ty1, const Type *Ty2) const;
+ virtual bool isTruncateFree(EVT VT1, EVT VT2) const;
+
+ /// isZExtFree - Return true if any actual instruction that defines a
+ /// value of type Ty1 implicitly zero-extends the value to Ty2 in the result
+ /// register. This does not necessarily include registers defined in
+ /// unknown ways, such as incoming arguments, or copies from unknown
+ /// virtual registers. Also, if isTruncateFree(Ty2, Ty1) is true, this
+ /// does not necessarily apply to truncate instructions. e.g. on x86-64,
+ /// all instructions that define 32-bit values implicitly zero-extend the
+ /// result out to 64 bits.
+ virtual bool isZExtFree(const Type *Ty1, const Type *Ty2) const;
+ virtual bool isZExtFree(EVT VT1, EVT VT2) const;
+
+ /// isNarrowingProfitable - Return true if it's profitable to narrow
+ /// operations of type VT1 to VT2. e.g. on x86, it's profitable to narrow
+ /// from i32 to i8 but not from i32 to i16.
+ virtual bool isNarrowingProfitable(EVT VT1, EVT VT2) const;
+
+ /// isFPImmLegal - Returns true if the target can instruction select the
+ /// specified FP immediate natively. If false, the legalizer will
+ /// materialize the FP immediate as a load from a constant pool.
+ virtual bool isFPImmLegal(const APFloat &Imm, EVT VT) const;
+
+ /// isShuffleMaskLegal - Targets can use this to indicate that they only
+ /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
+ /// By default, if a target supports the VECTOR_SHUFFLE node, all mask
+ /// values are assumed to be legal.
+ virtual bool isShuffleMaskLegal(const SmallVectorImpl<int> &Mask,
+ EVT VT) const;
+
+ /// isVectorClearMaskLegal - Similar to isShuffleMaskLegal. Targets can
+ /// use this to indicate if there is a suitable
+ /// VECTOR_SHUFFLE that can be used to replace a VAND with a constant
+ /// pool entry.
+ virtual bool isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
+ EVT VT) const;
+
+ /// ShouldShrinkFPConstant - If true, then instruction selection should
+ /// seek to shrink the FP constant of the specified type to a smaller type
+ /// in order to save space and / or reduce runtime.
+ virtual bool ShouldShrinkFPConstant(EVT VT) const {
+ // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
+ // expensive than a straight movsd. On the other hand, it's important to
+ // shrink long double fp constant since fldt is very slow.
+ return !X86ScalarSSEf64 || VT == MVT::f80;
+ }
+
+ const X86Subtarget* getSubtarget() const {
+ return Subtarget;
+ }
+
+ /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is
+ /// computed in an SSE register, not on the X87 floating point stack.
+ bool isScalarFPTypeInSSEReg(EVT VT) const {
+ return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2
+ (VT == MVT::f32 && X86ScalarSSEf32); // f32 is when SSE1
+ }
+
+ /// createFastISel - This method returns a target specific FastISel object,
+ /// or null if the target does not support "fast" ISel.
+ virtual FastISel *
+ createFastISel(MachineFunction &mf,
+ DenseMap<const Value *, unsigned> &,
+ DenseMap<const BasicBlock *, MachineBasicBlock *> &,
+ DenseMap<const AllocaInst *, int> &,
+ std::vector<std::pair<MachineInstr*, unsigned> > &
+#ifndef NDEBUG
+ , SmallSet<const Instruction *, 8> &
+#endif
+ ) const;
+
+ /// getFunctionAlignment - Return the Log2 alignment of this function.
+ virtual unsigned getFunctionAlignment(const Function *F) const;
+
+ private:
+ /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
+ /// make the right decision when generating code for different targets.
+ const X86Subtarget *Subtarget;
+ const X86RegisterInfo *RegInfo;
+ const TargetData *TD;
+
+ /// X86StackPtr - X86 physical register used as stack ptr.
+ unsigned X86StackPtr;
+
+ /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87
+ /// floating point ops.
+ /// When SSE is available, use it for f32 operations.
+ /// When SSE2 is available, use it for f64 operations.
+ bool X86ScalarSSEf32;
+ bool X86ScalarSSEf64;
+
+ /// LegalFPImmediates - A list of legal fp immediates.
+ std::vector<APFloat> LegalFPImmediates;
+
+ /// addLegalFPImmediate - Indicate that this x86 target can instruction
+ /// select the specified FP immediate natively.
+ void addLegalFPImmediate(const APFloat& Imm) {
+ LegalFPImmediates.push_back(Imm);
+ }
+
+ SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const;
+ SDValue LowerMemArgument(SDValue Chain,
+ CallingConv::ID CallConv,
+ const SmallVectorImpl<ISD::InputArg> &ArgInfo,
+ DebugLoc dl, SelectionDAG &DAG,
+ const CCValAssign &VA, MachineFrameInfo *MFI,
+ unsigned i) const;
+ SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
+ DebugLoc dl, SelectionDAG &DAG,
+ const CCValAssign &VA,
+ ISD::ArgFlagsTy Flags) const;
+
+ // Call lowering helpers.
+
+ /// IsEligibleForTailCallOptimization - Check whether the call is eligible
+ /// for tail call optimization. Targets which want to do tail call
+ /// optimization should implement this function.
+ bool IsEligibleForTailCallOptimization(SDValue Callee,
+ CallingConv::ID CalleeCC,
+ bool isVarArg,
+ bool isCalleeStructRet,
+ bool isCallerStructRet,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ SelectionDAG& DAG) const;
+ bool IsCalleePop(bool isVarArg, CallingConv::ID CallConv) const;
+ SDValue EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr,
+ SDValue Chain, bool IsTailCall, bool Is64Bit,
+ int FPDiff, DebugLoc dl) const;
+
+ CCAssignFn *CCAssignFnForNode(CallingConv::ID CallConv) const;
+ unsigned GetAlignedArgumentStackSize(unsigned StackSize,
+ SelectionDAG &DAG) const;
+
+ std::pair<SDValue,SDValue> FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
+ bool isSigned) const;
+
+ SDValue LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
+ SelectionDAG &DAG) const;
+ SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
+ int64_t Offset, SelectionDAG &DAG) const;
+ SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const;
+ SDValue BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot,
+ SelectionDAG &DAG) const;
+ SDValue LowerBIT_CONVERT(SDValue op, SelectionDAG &DAG) const;
+ SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFABS(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFNEG(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerToBT(SDValue And, ISD::CondCode CC,
+ DebugLoc dl, SelectionDAG &DAG) const;
+ SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerMEMSET(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerTRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerREADCYCLECOUNTER(SDValue Op, SelectionDAG &DAG) const;
+
+ virtual SDValue
+ LowerFormalArguments(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const;
+ virtual SDValue
+ LowerCall(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg, bool &isTailCall,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const;
+
+ virtual SDValue
+ LowerReturn(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ DebugLoc dl, SelectionDAG &DAG) const;
+
+ virtual bool
+ CanLowerReturn(CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<EVT> &OutTys,
+ const SmallVectorImpl<ISD::ArgFlagsTy> &ArgsFlags,
+ SelectionDAG &DAG) const;
+
+ void ReplaceATOMIC_BINARY_64(SDNode *N, SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG, unsigned NewOp) const;
+
+ /// Utility function to emit string processing sse4.2 instructions
+ /// that return in xmm0.
+ /// This takes the instruction to expand, the associated machine basic
+ /// block, the number of args, and whether or not the second arg is
+ /// in memory.
+ MachineBasicBlock *EmitPCMP(MachineInstr *BInstr, MachineBasicBlock *BB,
+ unsigned argNum, bool inMem) const;
+
+ /// Utility function to emit atomic bitwise operations (and, or, xor).
+ /// It takes the bitwise instruction to expand, the associated machine basic
+ /// block, and the associated X86 opcodes for reg/reg and reg/imm.
+ MachineBasicBlock *EmitAtomicBitwiseWithCustomInserter(
+ MachineInstr *BInstr,
+ MachineBasicBlock *BB,
+ unsigned regOpc,
+ unsigned immOpc,
+ unsigned loadOpc,
+ unsigned cxchgOpc,
+ unsigned copyOpc,
+ unsigned notOpc,
+ unsigned EAXreg,
+ TargetRegisterClass *RC,
+ bool invSrc = false) const;
+
+ MachineBasicBlock *EmitAtomicBit6432WithCustomInserter(
+ MachineInstr *BInstr,
+ MachineBasicBlock *BB,
+ unsigned regOpcL,
+ unsigned regOpcH,
+ unsigned immOpcL,
+ unsigned immOpcH,
+ bool invSrc = false) const;
+
+ /// Utility function to emit atomic min and max. It takes the min/max
+ /// instruction to expand, the associated basic block, and the associated
+ /// cmov opcode for moving the min or max value.
+ MachineBasicBlock *EmitAtomicMinMaxWithCustomInserter(MachineInstr *BInstr,
+ MachineBasicBlock *BB,
+ unsigned cmovOpc) const;
+
+ /// Utility function to emit the xmm reg save portion of va_start.
+ MachineBasicBlock *EmitVAStartSaveXMMRegsWithCustomInserter(
+ MachineInstr *BInstr,
+ MachineBasicBlock *BB) const;
+
+ MachineBasicBlock *EmitLoweredSelect(MachineInstr *I,
+ MachineBasicBlock *BB) const;
+
+ MachineBasicBlock *EmitLoweredMingwAlloca(MachineInstr *MI,
+ MachineBasicBlock *BB) const;
+
+ /// Emit nodes that will be selected as "test Op0,Op0", or something
+ /// equivalent, for use with the given x86 condition code.
+ SDValue EmitTest(SDValue Op0, unsigned X86CC, SelectionDAG &DAG) const;
+
+ /// Emit nodes that will be selected as "cmp Op0,Op1", or something
+ /// equivalent, for use with the given x86 condition code.
+ SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
+ SelectionDAG &DAG) const;
+ };
+
+ namespace X86 {
+ FastISel *createFastISel(MachineFunction &mf,
+ DenseMap<const Value *, unsigned> &,
+ DenseMap<const BasicBlock *, MachineBasicBlock *> &,
+ DenseMap<const AllocaInst *, int> &,
+ std::vector<std::pair<MachineInstr*, unsigned> > &
+#ifndef NDEBUG
+ , SmallSet<const Instruction*, 8> &
+#endif
+ );
+ }
+}
+
+#endif // X86ISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/X86/X86Instr64bit.td b/contrib/llvm/lib/Target/X86/X86Instr64bit.td
new file mode 100644
index 0000000..97eb17c
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86Instr64bit.td
@@ -0,0 +1,2376 @@
+//====- X86Instr64bit.td - Describe X86-64 Instructions ----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86-64 instruction set, defining the instructions,
+// and properties of the instructions which are needed for code generation,
+// machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Operand Definitions.
+//
+
+// 64 bits, but only 32 bits are significant.
+def i64i32imm : Operand<i64> {
+ let ParserMatchClass = ImmSExti64i32AsmOperand;
+}
+
+// 64 bits, but only 32 bits are significant, and those bits are treated as being
+// pc relative.
+def i64i32imm_pcrel : Operand<i64> {
+ let PrintMethod = "print_pcrel_imm";
+ let ParserMatchClass = X86AbsMemAsmOperand;
+}
+
+
+// 64 bits, but only 8 bits are significant.
+def i64i8imm : Operand<i64> {
+ let ParserMatchClass = ImmSExti64i8AsmOperand;
+}
+
+// Special i64mem for addresses of load folding tail calls. These are not
+// allowed to use callee-saved registers since they must be scheduled
+// after callee-saved registers are popped.
+def i64mem_TC : Operand<i64> {
+ let PrintMethod = "printi64mem";
+ let MIOperandInfo = (ops GR64_TC, i8imm, GR64_TC, i32imm, i8imm);
+ let ParserMatchClass = X86MemAsmOperand;
+}
+
+def lea64mem : Operand<i64> {
+ let PrintMethod = "printlea64mem";
+ let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm);
+ let ParserMatchClass = X86NoSegMemAsmOperand;
+}
+
+def lea64_32mem : Operand<i32> {
+ let PrintMethod = "printlea64_32mem";
+ let AsmOperandLowerMethod = "lower_lea64_32mem";
+ let MIOperandInfo = (ops GR32, i8imm, GR32_NOSP, i32imm);
+ let ParserMatchClass = X86NoSegMemAsmOperand;
+}
+
+//===----------------------------------------------------------------------===//
+// Complex Pattern Definitions.
+//
+def lea64addr : ComplexPattern<i64, 4, "SelectLEAAddr",
+ [add, sub, mul, X86mul_imm, shl, or, frameindex,
+ X86WrapperRIP], []>;
+
+def tls64addr : ComplexPattern<i64, 4, "SelectTLSADDRAddr",
+ [tglobaltlsaddr], []>;
+
+//===----------------------------------------------------------------------===//
+// Pattern fragments.
+//
+
+def i64immSExt8 : PatLeaf<(i64 immSext8)>;
+
+def GetLo32XForm : SDNodeXForm<imm, [{
+ // Transformation function: get the low 32 bits.
+ return getI32Imm((unsigned)N->getZExtValue());
+}]>;
+
+def i64immSExt32 : PatLeaf<(i64 imm), [{
+ // i64immSExt32 predicate - True if the 64-bit immediate fits in a 32-bit
+ // sign extended field.
+ return (int64_t)N->getZExtValue() == (int32_t)N->getZExtValue();
+}]>;
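+
+// For example, -1 and -2147483648 satisfy i64immSExt32 (their low 32 bits
+// sign extend back to the original value), while 2147483648 (0x80000000)
+// does not.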
+
+
+def i64immZExt32 : PatLeaf<(i64 imm), [{
+ // i64immZExt32 predicate - True if the 64-bit immediate fits in a 32-bit
+ // zero extended field.
+ return (uint64_t)N->getZExtValue() == (uint32_t)N->getZExtValue();
+}]>;
+
+def sextloadi64i8 : PatFrag<(ops node:$ptr), (i64 (sextloadi8 node:$ptr))>;
+def sextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (sextloadi16 node:$ptr))>;
+def sextloadi64i32 : PatFrag<(ops node:$ptr), (i64 (sextloadi32 node:$ptr))>;
+
+def zextloadi64i1 : PatFrag<(ops node:$ptr), (i64 (zextloadi1 node:$ptr))>;
+def zextloadi64i8 : PatFrag<(ops node:$ptr), (i64 (zextloadi8 node:$ptr))>;
+def zextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (zextloadi16 node:$ptr))>;
+def zextloadi64i32 : PatFrag<(ops node:$ptr), (i64 (zextloadi32 node:$ptr))>;
+
+def extloadi64i1 : PatFrag<(ops node:$ptr), (i64 (extloadi1 node:$ptr))>;
+def extloadi64i8 : PatFrag<(ops node:$ptr), (i64 (extloadi8 node:$ptr))>;
+def extloadi64i16 : PatFrag<(ops node:$ptr), (i64 (extloadi16 node:$ptr))>;
+def extloadi64i32 : PatFrag<(ops node:$ptr), (i64 (extloadi32 node:$ptr))>;
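+
+// These fragments let 64-bit instruction patterns match extending loads of
+// narrower memory types directly; e.g. a pattern using zextloadi64i8 covers
+// (i64 (zext (load i8))), which is what the movzbq-style zero-extending load
+// instructions select.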
+
+//===----------------------------------------------------------------------===//
+// Instruction list...
+//
+
+// ADJCALLSTACKDOWN/UP implicitly use/def RSP because they may be expanded into
+// a stack adjustment and the codegen must know that they may modify the stack
+// pointer before prolog-epilog rewriting occurs.
+// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
+// sub / add which can clobber EFLAGS.
+let Defs = [RSP, EFLAGS], Uses = [RSP] in {
+def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt),
+ "#ADJCALLSTACKDOWN",
+ [(X86callseq_start timm:$amt)]>,
+ Requires<[In64BitMode]>;
+def ADJCALLSTACKUP64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "#ADJCALLSTACKUP",
+ [(X86callseq_end timm:$amt1, timm:$amt2)]>,
+ Requires<[In64BitMode]>;
+}
+
+// Interrupt Instructions
+def IRET64 : RI<0xcf, RawFrm, (outs), (ins), "iret{q}", []>;
+
+//===----------------------------------------------------------------------===//
+// Call Instructions...
+//
+let isCall = 1 in
+ // All calls clobber the non-callee saved registers. RSP is marked as
+ // a use to prevent stack-pointer assignments that appear immediately
+ // before calls from potentially appearing dead. Uses for argument
+ // registers are added manually.
+ let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
+ FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, ST1,
+ MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
+ XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+ XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
+ Uses = [RSP] in {
+
+ // NOTE: this pattern doesn't match "X86call imm", because we do not know
+ // that the offset between an arbitrary immediate and the call will fit in
+ // the 32-bit pcrel field that we have.
+ def CALL64pcrel32 : Ii32PCRel<0xE8, RawFrm,
+ (outs), (ins i64i32imm_pcrel:$dst, variable_ops),
+ "call{q}\t$dst", []>,
+ Requires<[In64BitMode, NotWin64]>;
+ def CALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst, variable_ops),
+ "call{q}\t{*}$dst", [(X86call GR64:$dst)]>,
+ Requires<[NotWin64]>;
+ def CALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst, variable_ops),
+ "call{q}\t{*}$dst", [(X86call (loadi64 addr:$dst))]>,
+ Requires<[NotWin64]>;
+
+ def FARCALL64 : RI<0xFF, MRM3m, (outs), (ins opaque80mem:$dst),
+ "lcall{q}\t{*}$dst", []>;
+ }
+
+ // FIXME: We need to teach codegen about single list of call-clobbered
+ // registers.
+let isCall = 1 in
+ // All calls clobber the non-callee saved registers. RSP is marked as
+ // a use to prevent stack-pointer assignments that appear immediately
+ // before calls from potentially appearing dead. Uses for argument
+ // registers are added manually.
+ let Defs = [RAX, RCX, RDX, R8, R9, R10, R11,
+ FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, ST1,
+ MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
+ XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, EFLAGS],
+ Uses = [RSP] in {
+ def WINCALL64pcrel32 : I<0xE8, RawFrm,
+ (outs), (ins i64i32imm_pcrel:$dst, variable_ops),
+ "call\t$dst", []>,
+ Requires<[IsWin64]>;
+ def WINCALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst, variable_ops),
+ "call\t{*}$dst",
+ [(X86call GR64:$dst)]>, Requires<[IsWin64]>;
+ def WINCALL64m : I<0xFF, MRM2m, (outs),
+ (ins i64mem:$dst, variable_ops), "call\t{*}$dst",
+ [(X86call (loadi64 addr:$dst))]>,
+ Requires<[IsWin64]>;
+ }
+
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in
+ let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
+ FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, ST1,
+ MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
+ XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+ XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
+ Uses = [RSP] in {
+ def TCRETURNdi64 : I<0, Pseudo, (outs),
+ (ins i64i32imm_pcrel:$dst, i32imm:$offset, variable_ops),
+ "#TC_RETURN $dst $offset", []>;
+ def TCRETURNri64 : I<0, Pseudo, (outs), (ins GR64_TC:$dst, i32imm:$offset,
+ variable_ops),
+ "#TC_RETURN $dst $offset", []>;
+ let mayLoad = 1 in
+ def TCRETURNmi64 : I<0, Pseudo, (outs),
+ (ins i64mem_TC:$dst, i32imm:$offset, variable_ops),
+ "#TC_RETURN $dst $offset", []>;
+
+ def TAILJMPd64 : Ii32PCRel<0xE9, RawFrm, (outs),
+ (ins i64i32imm_pcrel:$dst, variable_ops),
+ "jmp\t$dst # TAILCALL", []>;
+ def TAILJMPr64 : I<0xFF, MRM4r, (outs), (ins GR64_TC:$dst, variable_ops),
+ "jmp{q}\t{*}$dst # TAILCALL", []>;
+
+ let mayLoad = 1 in
+ def TAILJMPm64 : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst, variable_ops),
+ "jmp{q}\t{*}$dst # TAILCALL", []>;
+}
+
+// Branches
+let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
+ def JMP64pcrel32 : I<0xE9, RawFrm, (outs), (ins brtarget:$dst),
+ "jmp{q}\t$dst", []>;
+ def JMP64r : I<0xFF, MRM4r, (outs), (ins GR64:$dst), "jmp{q}\t{*}$dst",
+ [(brind GR64:$dst)]>;
+ def JMP64m : I<0xFF, MRM4m, (outs), (ins i64mem:$dst), "jmp{q}\t{*}$dst",
+ [(brind (loadi64 addr:$dst))]>;
+ def FARJMP64 : RI<0xFF, MRM5m, (outs), (ins opaque80mem:$dst),
+ "ljmp{q}\t{*}$dst", []>;
+}
+
+//===----------------------------------------------------------------------===//
+// EH Pseudo Instructions
+//
+let isTerminator = 1, isReturn = 1, isBarrier = 1,
+ hasCtrlDep = 1, isCodeGenOnly = 1 in {
+def EH_RETURN64 : I<0xC3, RawFrm, (outs), (ins GR64:$addr),
+ "ret\t#eh_return, addr: $addr",
+ [(X86ehret GR64:$addr)]>;
+
+}
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous Instructions...
+//
+
+def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "popcnt{q}\t{$src, $dst|$dst, $src}", []>, XS;
+let mayLoad = 1 in
+def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "popcnt{q}\t{$src, $dst|$dst, $src}", []>, XS;
+
+let Defs = [RBP,RSP], Uses = [RBP,RSP], mayLoad = 1, neverHasSideEffects = 1 in
+def LEAVE64 : I<0xC9, RawFrm,
+ (outs), (ins), "leave", []>;
+let Defs = [RSP], Uses = [RSP], neverHasSideEffects=1 in {
+let mayLoad = 1 in {
+def POP64r : I<0x58, AddRegFrm,
+ (outs GR64:$reg), (ins), "pop{q}\t$reg", []>;
+def POP64rmr: I<0x8F, MRM0r, (outs GR64:$reg), (ins), "pop{q}\t$reg", []>;
+def POP64rmm: I<0x8F, MRM0m, (outs i64mem:$dst), (ins), "pop{q}\t$dst", []>;
+}
+let mayStore = 1 in {
+def PUSH64r : I<0x50, AddRegFrm,
+ (outs), (ins GR64:$reg), "push{q}\t$reg", []>;
+def PUSH64rmr: I<0xFF, MRM6r, (outs), (ins GR64:$reg), "push{q}\t$reg", []>;
+def PUSH64rmm: I<0xFF, MRM6m, (outs), (ins i64mem:$src), "push{q}\t$src", []>;
+}
+}
+
+let Defs = [RSP], Uses = [RSP], neverHasSideEffects = 1, mayStore = 1 in {
+def PUSH64i8 : Ii8<0x6a, RawFrm, (outs), (ins i8imm:$imm),
+ "push{q}\t$imm", []>;
+def PUSH64i16 : Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm),
+ "push{q}\t$imm", []>;
+def PUSH64i32 : Ii32<0x68, RawFrm, (outs), (ins i64i32imm:$imm),
+ "push{q}\t$imm", []>;
+}
+
+let Defs = [RSP, EFLAGS], Uses = [RSP], mayLoad = 1, neverHasSideEffects=1 in
+def POPF64 : I<0x9D, RawFrm, (outs), (ins), "popfq", []>,
+ Requires<[In64BitMode]>;
+let Defs = [RSP], Uses = [RSP, EFLAGS], mayStore = 1, neverHasSideEffects=1 in
+def PUSHF64 : I<0x9C, RawFrm, (outs), (ins), "pushfq", []>,
+ Requires<[In64BitMode]>;
+
+def LEA64_32r : I<0x8D, MRMSrcMem,
+ (outs GR32:$dst), (ins lea64_32mem:$src),
+ "lea{l}\t{$src|$dst}, {$dst|$src}",
+ [(set GR32:$dst, lea32addr:$src)]>, Requires<[In64BitMode]>;
+
+let isReMaterializable = 1 in
+def LEA64r : RI<0x8D, MRMSrcMem, (outs GR64:$dst), (ins lea64mem:$src),
+ "lea{q}\t{$src|$dst}, {$dst|$src}",
+ [(set GR64:$dst, lea64addr:$src)]>;
+
+let isTwoAddress = 1 in
+def BSWAP64r : RI<0xC8, AddRegFrm, (outs GR64:$dst), (ins GR64:$src),
+ "bswap{q}\t$dst",
+ [(set GR64:$dst, (bswap GR64:$src))]>, TB;
+
+// Bit scan instructions.
+let Defs = [EFLAGS] in {
+def BSF64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "bsf{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, EFLAGS, (X86bsf GR64:$src))]>, TB;
+def BSF64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "bsf{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, EFLAGS, (X86bsf (loadi64 addr:$src)))]>, TB;
+
+def BSR64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "bsr{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))]>, TB;
+def BSR64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "bsr{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, EFLAGS, (X86bsr (loadi64 addr:$src)))]>, TB;
+} // Defs = [EFLAGS]
+
+// Repeat string ops
+let Defs = [RCX,RDI,RSI], Uses = [RCX,RDI,RSI], isCodeGenOnly = 1 in
+def REP_MOVSQ : RI<0xA5, RawFrm, (outs), (ins), "{rep;movsq|rep movsq}",
+ [(X86rep_movs i64)]>, REP;
+let Defs = [RCX,RDI], Uses = [RAX,RCX,RDI], isCodeGenOnly = 1 in
+def REP_STOSQ : RI<0xAB, RawFrm, (outs), (ins), "{rep;stosq|rep stosq}",
+ [(X86rep_stos i64)]>, REP;
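+// For example, lowering a sufficiently large memset can select REP_STOSQ:
+// with the count in RCX, the value in RAX and the destination in RDI,
+// "rep stosq" stores RCX quadwords of RAX starting at (%rdi).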
+
+let Defs = [EDI,ESI], Uses = [EDI,ESI,EFLAGS] in
+def MOVSQ : RI<0xA5, RawFrm, (outs), (ins), "movsq", []>;
+
+let Defs = [RCX,RDI], Uses = [RAX,RCX,RDI,EFLAGS] in
+def STOSQ : RI<0xAB, RawFrm, (outs), (ins), "stosq", []>;
+
+def SCAS64 : RI<0xAF, RawFrm, (outs), (ins), "scasq", []>;
+
+def CMPS64 : RI<0xA7, RawFrm, (outs), (ins), "cmpsq", []>;
+
+// Fast system-call instructions
+def SYSEXIT64 : RI<0x35, RawFrm,
+ (outs), (ins), "sysexit", []>, TB;
+
+//===----------------------------------------------------------------------===//
+// Move Instructions...
+//
+
+let neverHasSideEffects = 1 in
+def MOV64rr : RI<0x89, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", []>;
+
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+def MOV64ri : RIi64<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64imm:$src),
+ "movabs{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, imm:$src)]>;
+def MOV64ri32 : RIi32<0xC7, MRM0r, (outs GR64:$dst), (ins i64i32imm:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, i64immSExt32:$src)]>;
+}
+
+// The assembler accepts movq of a 64-bit immediate as an alternate spelling of
+// movabsq.
+let isAsmParserOnly = 1 in {
+def MOV64ri_alt : RIi64<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64imm:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", []>;
+}
+
+let isCodeGenOnly = 1 in {
+def MOV64rr_REV : RI<0x8B, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", []>;
+}
+
+let canFoldAsLoad = 1, isReMaterializable = 1 in
+def MOV64rm : RI<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (load addr:$src))]>;
+
+def MOV64mr : RI<0x89, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}",
+ [(store GR64:$src, addr:$dst)]>;
+def MOV64mi32 : RIi32<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}",
+ [(store i64immSExt32:$src, addr:$dst)]>;
+
+/// Versions of MOV64rr, MOV64rm, and MOV64mr for i64mem_TC and GR64_TC.
+let neverHasSideEffects = 1 in
+def MOV64rr_TC : RI<0x89, MRMDestReg, (outs GR64_TC:$dst), (ins GR64_TC:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", []>;
+
+let mayLoad = 1,
+ canFoldAsLoad = 1, isReMaterializable = 1 in
+def MOV64rm_TC : RI<0x8B, MRMSrcMem, (outs GR64_TC:$dst), (ins i64mem_TC:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}",
+ []>;
+
+let mayStore = 1 in
+def MOV64mr_TC : RI<0x89, MRMDestMem, (outs), (ins i64mem_TC:$dst, GR64_TC:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}",
+ []>;
+
+def MOV64o8a : RIi8<0xA0, RawFrm, (outs), (ins offset8:$src),
+ "mov{q}\t{$src, %rax|%rax, $src}", []>;
+def MOV64o64a : RIi32<0xA1, RawFrm, (outs), (ins offset64:$src),
+ "mov{q}\t{$src, %rax|%rax, $src}", []>;
+def MOV64ao8 : RIi8<0xA2, RawFrm, (outs offset8:$dst), (ins),
+ "mov{q}\t{%rax, $dst|$dst, %rax}", []>;
+def MOV64ao64 : RIi32<0xA3, RawFrm, (outs offset64:$dst), (ins),
+ "mov{q}\t{%rax, $dst|$dst, %rax}", []>;
+
+// Moves to and from segment registers
+def MOV64rs : RI<0x8C, MRMDestReg, (outs GR64:$dst), (ins SEGMENT_REG:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", []>;
+def MOV64ms : RI<0x8C, MRMDestMem, (outs i64mem:$dst), (ins SEGMENT_REG:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", []>;
+def MOV64sr : RI<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR64:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", []>;
+def MOV64sm : RI<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i64mem:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", []>;
+
+// Moves to and from debug registers
+def MOV64rd : I<0x21, MRMDestReg, (outs GR64:$dst), (ins DEBUG_REG:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", []>, TB;
+def MOV64dr : I<0x23, MRMSrcReg, (outs DEBUG_REG:$dst), (ins GR64:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", []>, TB;
+
+// Moves to and from control registers
+def MOV64rc : I<0x20, MRMDestReg, (outs GR64:$dst), (ins CONTROL_REG:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", []>, TB;
+def MOV64cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR64:$src),
+ "mov{q}\t{$src, $dst|$dst, $src}", []>, TB;
+
+// Sign/Zero extenders
+
+// MOVSX64rr8 always has a REX prefix and it has an 8-bit register
+// operand, which makes it a rare instruction with an 8-bit register
+// operand that can never access an h register. If support for h registers
+// were generalized, this would require a special register class.
+def MOVSX64rr8 : RI<0xBE, MRMSrcReg, (outs GR64:$dst), (ins GR8 :$src),
+ "movs{bq|x}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (sext GR8:$src))]>, TB;
+def MOVSX64rm8 : RI<0xBE, MRMSrcMem, (outs GR64:$dst), (ins i8mem :$src),
+ "movs{bq|x}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (sextloadi64i8 addr:$src))]>, TB;
+def MOVSX64rr16: RI<0xBF, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src),
+ "movs{wq|x}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (sext GR16:$src))]>, TB;
+def MOVSX64rm16: RI<0xBF, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
+ "movs{wq|x}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (sextloadi64i16 addr:$src))]>, TB;
+def MOVSX64rr32: RI<0x63, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src),
+ "movs{lq|xd}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (sext GR32:$src))]>;
+def MOVSX64rm32: RI<0x63, MRMSrcMem, (outs GR64:$dst), (ins i32mem:$src),
+ "movs{lq|xd}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (sextloadi64i32 addr:$src))]>;
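+// For example, "movsbq %sil, %rax" (MOVSX64rr8) needs a REX prefix to address
+// SIL at all, and once a REX prefix is present the AH, BH, CH and DH registers
+// cannot be encoded, which is the h-register restriction noted above.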
+
+// movzbq and movzwq encodings for the disassembler
+def MOVZX64rr8_Q : RI<0xB6, MRMSrcReg, (outs GR64:$dst), (ins GR8:$src),
+ "movz{bq|x}\t{$src, $dst|$dst, $src}", []>, TB;
+def MOVZX64rm8_Q : RI<0xB6, MRMSrcMem, (outs GR64:$dst), (ins i8mem:$src),
+ "movz{bq|x}\t{$src, $dst|$dst, $src}", []>, TB;
+def MOVZX64rr16_Q : RI<0xB7, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src),
+ "movz{wq|x}\t{$src, $dst|$dst, $src}", []>, TB;
+def MOVZX64rm16_Q : RI<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
+ "movz{wq|x}\t{$src, $dst|$dst, $src}", []>, TB;
+
+// Use movzbl instead of movzbq when the destination is a register; it's
+// equivalent due to implicit zero-extending, and it has a smaller encoding.
+def MOVZX64rr8 : I<0xB6, MRMSrcReg, (outs GR64:$dst), (ins GR8 :$src),
+ "", [(set GR64:$dst, (zext GR8:$src))]>, TB;
+def MOVZX64rm8 : I<0xB6, MRMSrcMem, (outs GR64:$dst), (ins i8mem :$src),
+ "", [(set GR64:$dst, (zextloadi64i8 addr:$src))]>, TB;
+// Use movzwl instead of movzwq when the destination is a register; it's
+// equivalent due to implicit zero-extending, and it has a smaller encoding.
+def MOVZX64rr16: I<0xB7, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src),
+ "", [(set GR64:$dst, (zext GR16:$src))]>, TB;
+def MOVZX64rm16: I<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
+ "", [(set GR64:$dst, (zextloadi64i16 addr:$src))]>, TB;
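+// For example, (i64 (zext GR8:%cl)) selected as MOVZX64rr8 is emitted as
+// "movzbl %cl, %eax"; the 32-bit write implicitly clears bits 63:32 of %rax,
+// so no REX.W form is needed.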
+
+// There's no movzlq instruction, but movl can be used for this purpose, using
+// implicit zero-extension. The preferred way to do 32-bit-to-64-bit zero
+// extension on x86-64 is to use a SUBREG_TO_REG to utilize implicit
+// zero-extension; however, this isn't possible when the 32-bit value is
+// defined by a truncate or is copied from something where the high bits aren't
+// necessarily all zero. In such cases, we fall back to these explicit zext
+// instructions.
+def MOVZX64rr32 : I<0x89, MRMDestReg, (outs GR64:$dst), (ins GR32:$src),
+ "", [(set GR64:$dst, (zext GR32:$src))]>;
+def MOVZX64rm32 : I<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i32mem:$src),
+ "", [(set GR64:$dst, (zextloadi64i32 addr:$src))]>;
+
+// Any instruction that defines a 32-bit result implicitly zeros the high half
+// of the 64-bit register. Truncate can be lowered to EXTRACT_SUBREG and
+// CopyFromReg may be copying from a truncate, so neither guarantees this; and
+// x86's cmov doesn't do anything if the condition is false. Any other 32-bit
+// operation will zero-extend up to 64 bits.
+def def32 : PatLeaf<(i32 GR32:$src), [{
+ return N->getOpcode() != ISD::TRUNCATE &&
+ N->getOpcode() != TargetOpcode::EXTRACT_SUBREG &&
+ N->getOpcode() != ISD::CopyFromReg &&
+ N->getOpcode() != X86ISD::CMOV;
+}]>;
+
+// In the case of a 32-bit def that is known to implicitly zero-extend,
+// we can use a SUBREG_TO_REG.
+def : Pat<(i64 (zext def32:$src)),
+ (SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>;
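+// For example, for "uint64_t y = (uint32_t)(a + b)" the 32-bit add that
+// produces the value already zeroed bits 63:32, so the pattern above lowers
+// the zext to a SUBREG_TO_REG and typically no extra instruction is emitted.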
+
+let neverHasSideEffects = 1 in {
+ let Defs = [RAX], Uses = [EAX] in
+ def CDQE : RI<0x98, RawFrm, (outs), (ins),
+ "{cltq|cdqe}", []>; // RAX = signext(EAX)
+
+ let Defs = [RAX,RDX], Uses = [RAX] in
+ def CQO : RI<0x99, RawFrm, (outs), (ins),
+ "{cqto|cqo}", []>; // RDX:RAX = signext(RAX)
+}
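+// For example, signed 64-bit division is typically lowered to "cqto" followed
+// by "idivq": CQO sign-extends RAX into RDX:RAX so IDIV64r sees the full
+// 128-bit dividend.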
+
+//===----------------------------------------------------------------------===//
+// Arithmetic Instructions...
+//
+
+let Defs = [EFLAGS] in {
+
+def ADD64i32 : RIi32<0x05, RawFrm, (outs), (ins i64i32imm:$src),
+ "add{q}\t{$src, %rax|%rax, $src}", []>;
+
+let isTwoAddress = 1 in {
+let isConvertibleToThreeAddress = 1 in {
+let isCommutable = 1 in
+// Register-Register Addition
+def ADD64rr : RI<0x01, MRMDestReg, (outs GR64:$dst),
+ (ins GR64:$src1, GR64:$src2),
+ "add{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86add_flag GR64:$src1, GR64:$src2))]>;
+
+// These are alternate spellings for use by the disassembler; we mark them as
+// code gen only to ensure they aren't matched by the assembler.
+let isCodeGenOnly = 1 in {
+ def ADD64rr_alt : RI<0x03, MRMSrcReg, (outs GR64:$dst),
+ (ins GR64:$src1, GR64:$src2),
+                       "add{q}\t{$src2, $dst|$dst, $src2}", []>;
+}
+
+// Register-Integer Addition
+def ADD64ri8 : RIi8<0x83, MRM0r, (outs GR64:$dst),
+ (ins GR64:$src1, i64i8imm:$src2),
+ "add{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86add_flag GR64:$src1, i64immSExt8:$src2))]>;
+def ADD64ri32 : RIi32<0x81, MRM0r, (outs GR64:$dst),
+ (ins GR64:$src1, i64i32imm:$src2),
+ "add{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86add_flag GR64:$src1, i64immSExt32:$src2))]>;
+} // isConvertibleToThreeAddress
+
+// Register-Memory Addition
+def ADD64rm : RI<0x03, MRMSrcMem, (outs GR64:$dst),
+ (ins GR64:$src1, i64mem:$src2),
+ "add{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86add_flag GR64:$src1, (load addr:$src2)))]>;
+
+} // isTwoAddress
+
+// Memory-Register Addition
+def ADD64mr : RI<0x01, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
+ "add{q}\t{$src2, $dst|$dst, $src2}",
+ [(store (add (load addr:$dst), GR64:$src2), addr:$dst),
+ (implicit EFLAGS)]>;
+def ADD64mi8 : RIi8<0x83, MRM0m, (outs), (ins i64mem:$dst, i64i8imm :$src2),
+ "add{q}\t{$src2, $dst|$dst, $src2}",
+ [(store (add (load addr:$dst), i64immSExt8:$src2), addr:$dst),
+ (implicit EFLAGS)]>;
+def ADD64mi32 : RIi32<0x81, MRM0m, (outs), (ins i64mem:$dst, i64i32imm :$src2),
+ "add{q}\t{$src2, $dst|$dst, $src2}",
+ [(store (add (load addr:$dst), i64immSExt32:$src2), addr:$dst),
+ (implicit EFLAGS)]>;
+
+let Uses = [EFLAGS] in {
+
+def ADC64i32 : RIi32<0x15, RawFrm, (outs), (ins i64i32imm:$src),
+ "adc{q}\t{$src, %rax|%rax, $src}", []>;
+
+let isTwoAddress = 1 in {
+let isCommutable = 1 in
+def ADC64rr : RI<0x11, MRMDestReg, (outs GR64:$dst),
+ (ins GR64:$src1, GR64:$src2),
+ "adc{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (adde GR64:$src1, GR64:$src2))]>;
+
+let isCodeGenOnly = 1 in {
+def ADC64rr_REV : RI<0x13, MRMSrcReg , (outs GR64:$dst),
+ (ins GR64:$src1, GR64:$src2),
+ "adc{q}\t{$src2, $dst|$dst, $src2}", []>;
+}
+
+def ADC64rm : RI<0x13, MRMSrcMem , (outs GR64:$dst),
+ (ins GR64:$src1, i64mem:$src2),
+ "adc{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (adde GR64:$src1, (load addr:$src2)))]>;
+
+def ADC64ri8 : RIi8<0x83, MRM2r, (outs GR64:$dst),
+ (ins GR64:$src1, i64i8imm:$src2),
+ "adc{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (adde GR64:$src1, i64immSExt8:$src2))]>;
+def ADC64ri32 : RIi32<0x81, MRM2r, (outs GR64:$dst),
+ (ins GR64:$src1, i64i32imm:$src2),
+ "adc{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (adde GR64:$src1, i64immSExt32:$src2))]>;
+} // isTwoAddress
+
+def ADC64mr : RI<0x11, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
+ "adc{q}\t{$src2, $dst|$dst, $src2}",
+ [(store (adde (load addr:$dst), GR64:$src2), addr:$dst)]>;
+def ADC64mi8 : RIi8<0x83, MRM2m, (outs), (ins i64mem:$dst, i64i8imm :$src2),
+ "adc{q}\t{$src2, $dst|$dst, $src2}",
+ [(store (adde (load addr:$dst), i64immSExt8:$src2),
+ addr:$dst)]>;
+def ADC64mi32 : RIi32<0x81, MRM2m, (outs), (ins i64mem:$dst, i64i32imm:$src2),
+ "adc{q}\t{$src2, $dst|$dst, $src2}",
+ [(store (adde (load addr:$dst), i64immSExt32:$src2),
+ addr:$dst)]>;
+} // Uses = [EFLAGS]
+
+let isTwoAddress = 1 in {
+// Register-Register Subtraction
+def SUB64rr : RI<0x29, MRMDestReg, (outs GR64:$dst),
+ (ins GR64:$src1, GR64:$src2),
+ "sub{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86sub_flag GR64:$src1, GR64:$src2))]>;
+
+let isCodeGenOnly = 1 in {
+def SUB64rr_REV : RI<0x2B, MRMSrcReg, (outs GR64:$dst),
+ (ins GR64:$src1, GR64:$src2),
+ "sub{q}\t{$src2, $dst|$dst, $src2}", []>;
+}
+
+// Register-Memory Subtraction
+def SUB64rm : RI<0x2B, MRMSrcMem, (outs GR64:$dst),
+ (ins GR64:$src1, i64mem:$src2),
+ "sub{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86sub_flag GR64:$src1, (load addr:$src2)))]>;
+
+// Register-Integer Subtraction
+def SUB64ri8 : RIi8<0x83, MRM5r, (outs GR64:$dst),
+ (ins GR64:$src1, i64i8imm:$src2),
+ "sub{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86sub_flag GR64:$src1, i64immSExt8:$src2))]>;
+def SUB64ri32 : RIi32<0x81, MRM5r, (outs GR64:$dst),
+ (ins GR64:$src1, i64i32imm:$src2),
+ "sub{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86sub_flag GR64:$src1, i64immSExt32:$src2))]>;
+} // isTwoAddress
+
+def SUB64i32 : RIi32<0x2D, RawFrm, (outs), (ins i64i32imm:$src),
+ "sub{q}\t{$src, %rax|%rax, $src}", []>;
+
+// Memory-Register Subtraction
+def SUB64mr : RI<0x29, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
+ "sub{q}\t{$src2, $dst|$dst, $src2}",
+ [(store (sub (load addr:$dst), GR64:$src2), addr:$dst),
+ (implicit EFLAGS)]>;
+
+// Memory-Integer Subtraction
+def SUB64mi8 : RIi8<0x83, MRM5m, (outs), (ins i64mem:$dst, i64i8imm :$src2),
+ "sub{q}\t{$src2, $dst|$dst, $src2}",
+ [(store (sub (load addr:$dst), i64immSExt8:$src2),
+ addr:$dst),
+ (implicit EFLAGS)]>;
+def SUB64mi32 : RIi32<0x81, MRM5m, (outs), (ins i64mem:$dst, i64i32imm:$src2),
+ "sub{q}\t{$src2, $dst|$dst, $src2}",
+ [(store (sub (load addr:$dst), i64immSExt32:$src2),
+ addr:$dst),
+ (implicit EFLAGS)]>;
+
+let Uses = [EFLAGS] in {
+let isTwoAddress = 1 in {
+def SBB64rr : RI<0x19, MRMDestReg, (outs GR64:$dst),
+ (ins GR64:$src1, GR64:$src2),
+ "sbb{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (sube GR64:$src1, GR64:$src2))]>;
+
+let isCodeGenOnly = 1 in {
+def SBB64rr_REV : RI<0x1B, MRMSrcReg, (outs GR64:$dst),
+ (ins GR64:$src1, GR64:$src2),
+ "sbb{q}\t{$src2, $dst|$dst, $src2}", []>;
+}
+
+def SBB64rm : RI<0x1B, MRMSrcMem, (outs GR64:$dst),
+ (ins GR64:$src1, i64mem:$src2),
+ "sbb{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (sube GR64:$src1, (load addr:$src2)))]>;
+
+def SBB64ri8 : RIi8<0x83, MRM3r, (outs GR64:$dst),
+ (ins GR64:$src1, i64i8imm:$src2),
+ "sbb{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (sube GR64:$src1, i64immSExt8:$src2))]>;
+def SBB64ri32 : RIi32<0x81, MRM3r, (outs GR64:$dst),
+ (ins GR64:$src1, i64i32imm:$src2),
+ "sbb{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (sube GR64:$src1, i64immSExt32:$src2))]>;
+} // isTwoAddress
+
+def SBB64i32 : RIi32<0x1D, RawFrm, (outs), (ins i64i32imm:$src),
+ "sbb{q}\t{$src, %rax|%rax, $src}", []>;
+
+def SBB64mr : RI<0x19, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
+ "sbb{q}\t{$src2, $dst|$dst, $src2}",
+ [(store (sube (load addr:$dst), GR64:$src2), addr:$dst)]>;
+def SBB64mi8 : RIi8<0x83, MRM3m, (outs), (ins i64mem:$dst, i64i8imm :$src2),
+ "sbb{q}\t{$src2, $dst|$dst, $src2}",
+ [(store (sube (load addr:$dst), i64immSExt8:$src2), addr:$dst)]>;
+def SBB64mi32 : RIi32<0x81, MRM3m, (outs), (ins i64mem:$dst, i64i32imm:$src2),
+ "sbb{q}\t{$src2, $dst|$dst, $src2}",
+ [(store (sube (load addr:$dst), i64immSExt32:$src2), addr:$dst)]>;
+} // Uses = [EFLAGS]
+} // Defs = [EFLAGS]
+
+// Unsigned multiplication
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX], neverHasSideEffects = 1 in {
+def MUL64r : RI<0xF7, MRM4r, (outs), (ins GR64:$src),
+ "mul{q}\t$src", []>; // RAX,RDX = RAX*GR64
+let mayLoad = 1 in
+def MUL64m : RI<0xF7, MRM4m, (outs), (ins i64mem:$src),
+ "mul{q}\t$src", []>; // RAX,RDX = RAX*[mem64]
+
+// Signed multiplication
+def IMUL64r : RI<0xF7, MRM5r, (outs), (ins GR64:$src),
+ "imul{q}\t$src", []>; // RAX,RDX = RAX*GR64
+let mayLoad = 1 in
+def IMUL64m : RI<0xF7, MRM5m, (outs), (ins i64mem:$src),
+ "imul{q}\t$src", []>; // RAX,RDX = RAX*[mem64]
+}
+
+let Defs = [EFLAGS] in {
+let isTwoAddress = 1 in {
+let isCommutable = 1 in
+// Register-Register Signed Integer Multiplication
+def IMUL64rr : RI<0xAF, MRMSrcReg, (outs GR64:$dst),
+ (ins GR64:$src1, GR64:$src2),
+ "imul{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86smul_flag GR64:$src1, GR64:$src2))]>, TB;
+
+// Register-Memory Signed Integer Multiplication
+def IMUL64rm : RI<0xAF, MRMSrcMem, (outs GR64:$dst),
+ (ins GR64:$src1, i64mem:$src2),
+ "imul{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86smul_flag GR64:$src1, (load addr:$src2)))]>, TB;
+} // isTwoAddress
+
+// Surprisingly enough, these are not two-address instructions!
+
+// Register-Integer Signed Integer Multiplication
+def IMUL64rri8 : RIi8<0x6B, MRMSrcReg, // GR64 = GR64*I8
+ (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
+ "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86smul_flag GR64:$src1, i64immSExt8:$src2))]>;
+def IMUL64rri32 : RIi32<0x69, MRMSrcReg, // GR64 = GR64*I32
+ (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
+ "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86smul_flag GR64:$src1, i64immSExt32:$src2))]>;
+
+// Memory-Integer Signed Integer Multiplication
+def IMUL64rmi8 : RIi8<0x6B, MRMSrcMem, // GR64 = [mem64]*I8
+ (outs GR64:$dst), (ins i64mem:$src1, i64i8imm: $src2),
+ "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86smul_flag (load addr:$src1),
+ i64immSExt8:$src2))]>;
+def IMUL64rmi32 : RIi32<0x69, MRMSrcMem, // GR64 = [mem64]*I32
+ (outs GR64:$dst), (ins i64mem:$src1, i64i32imm:$src2),
+ "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86smul_flag (load addr:$src1),
+ i64immSExt32:$src2))]>;
+} // Defs = [EFLAGS]
+
+// Unsigned division / remainder
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in {
+// RDX:RAX/r64 = RAX,RDX
+def DIV64r : RI<0xF7, MRM6r, (outs), (ins GR64:$src),
+ "div{q}\t$src", []>;
+// Signed division / remainder
+// RDX:RAX/r64 = RAX,RDX
+def IDIV64r: RI<0xF7, MRM7r, (outs), (ins GR64:$src),
+ "idiv{q}\t$src", []>;
+let mayLoad = 1 in {
+// RDX:RAX/[mem64] = RAX,RDX
+def DIV64m : RI<0xF7, MRM6m, (outs), (ins i64mem:$src),
+ "div{q}\t$src", []>;
+// RDX:RAX/[mem64] = RAX,RDX
+def IDIV64m: RI<0xF7, MRM7m, (outs), (ins i64mem:$src),
+ "idiv{q}\t$src", []>;
+}
+}
+
+// Unary instructions
+let Defs = [EFLAGS], CodeSize = 2 in {
+let isTwoAddress = 1 in
+def NEG64r : RI<0xF7, MRM3r, (outs GR64:$dst), (ins GR64:$src), "neg{q}\t$dst",
+ [(set GR64:$dst, (ineg GR64:$src)),
+ (implicit EFLAGS)]>;
+def NEG64m : RI<0xF7, MRM3m, (outs), (ins i64mem:$dst), "neg{q}\t$dst",
+ [(store (ineg (loadi64 addr:$dst)), addr:$dst),
+ (implicit EFLAGS)]>;
+
+let isTwoAddress = 1, isConvertibleToThreeAddress = 1 in
+def INC64r : RI<0xFF, MRM0r, (outs GR64:$dst), (ins GR64:$src), "inc{q}\t$dst",
+ [(set GR64:$dst, EFLAGS, (X86inc_flag GR64:$src))]>;
+def INC64m : RI<0xFF, MRM0m, (outs), (ins i64mem:$dst), "inc{q}\t$dst",
+ [(store (add (loadi64 addr:$dst), 1), addr:$dst),
+ (implicit EFLAGS)]>;
+
+let isTwoAddress = 1, isConvertibleToThreeAddress = 1 in
+def DEC64r : RI<0xFF, MRM1r, (outs GR64:$dst), (ins GR64:$src), "dec{q}\t$dst",
+ [(set GR64:$dst, EFLAGS, (X86dec_flag GR64:$src))]>;
+def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
+ [(store (add (loadi64 addr:$dst), -1), addr:$dst),
+ (implicit EFLAGS)]>;
+
+// In 64-bit mode, single byte INC and DEC cannot be encoded: the 0x40-0x4F
+// opcodes are repurposed as REX prefixes.
+let isTwoAddress = 1, isConvertibleToThreeAddress = 1 in {
+// Can transform into LEA.
+def INC64_16r : I<0xFF, MRM0r, (outs GR16:$dst), (ins GR16:$src),
+ "inc{w}\t$dst",
+ [(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src))]>,
+ OpSize, Requires<[In64BitMode]>;
+def INC64_32r : I<0xFF, MRM0r, (outs GR32:$dst), (ins GR32:$src),
+ "inc{l}\t$dst",
+ [(set GR32:$dst, EFLAGS, (X86inc_flag GR32:$src))]>,
+ Requires<[In64BitMode]>;
+def DEC64_16r : I<0xFF, MRM1r, (outs GR16:$dst), (ins GR16:$src),
+ "dec{w}\t$dst",
+ [(set GR16:$dst, EFLAGS, (X86dec_flag GR16:$src))]>,
+ OpSize, Requires<[In64BitMode]>;
+def DEC64_32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src),
+ "dec{l}\t$dst",
+ [(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src))]>,
+ Requires<[In64BitMode]>;
+} // isConvertibleToThreeAddress
+
+// These are duplicates of their 32-bit counterparts. Only needed so X86 knows
+// how to unfold them.
+let isTwoAddress = 0, CodeSize = 2 in {
+ def INC64_16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst), "inc{w}\t$dst",
+ [(store (add (loadi16 addr:$dst), 1), addr:$dst),
+ (implicit EFLAGS)]>,
+ OpSize, Requires<[In64BitMode]>;
+ def INC64_32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst), "inc{l}\t$dst",
+ [(store (add (loadi32 addr:$dst), 1), addr:$dst),
+ (implicit EFLAGS)]>,
+ Requires<[In64BitMode]>;
+ def DEC64_16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst), "dec{w}\t$dst",
+ [(store (add (loadi16 addr:$dst), -1), addr:$dst),
+ (implicit EFLAGS)]>,
+ OpSize, Requires<[In64BitMode]>;
+ def DEC64_32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst), "dec{l}\t$dst",
+ [(store (add (loadi32 addr:$dst), -1), addr:$dst),
+ (implicit EFLAGS)]>,
+ Requires<[In64BitMode]>;
+}
+} // Defs = [EFLAGS], CodeSize
+
+
+let Defs = [EFLAGS] in {
+// Shift instructions
+let isTwoAddress = 1 in {
+let Uses = [CL] in
+def SHL64rCL : RI<0xD3, MRM4r, (outs GR64:$dst), (ins GR64:$src),
+ "shl{q}\t{%cl, $dst|$dst, %CL}",
+ [(set GR64:$dst, (shl GR64:$src, CL))]>;
+let isConvertibleToThreeAddress = 1 in // Can transform into LEA.
+def SHL64ri : RIi8<0xC1, MRM4r, (outs GR64:$dst),
+ (ins GR64:$src1, i8imm:$src2),
+ "shl{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (shl GR64:$src1, (i8 imm:$src2)))]>;
+// NOTE: We don't include patterns for shifts of a register by one, because
+// 'add reg,reg' is cheaper.
+def SHL64r1 : RI<0xD1, MRM4r, (outs GR64:$dst), (ins GR64:$src1),
+ "shl{q}\t$dst", []>;
+} // isTwoAddress
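+// For example, "x << 1" on a GR64 value is instead selected as an add of the
+// value to itself (ADD64rr, e.g. "addq %rax, %rax"), which is why SHL64r1
+// above carries no pattern of its own.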
+
+let Uses = [CL] in
+def SHL64mCL : RI<0xD3, MRM4m, (outs), (ins i64mem:$dst),
+ "shl{q}\t{%cl, $dst|$dst, %CL}",
+ [(store (shl (loadi64 addr:$dst), CL), addr:$dst)]>;
+def SHL64mi : RIi8<0xC1, MRM4m, (outs), (ins i64mem:$dst, i8imm:$src),
+ "shl{q}\t{$src, $dst|$dst, $src}",
+ [(store (shl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+def SHL64m1 : RI<0xD1, MRM4m, (outs), (ins i64mem:$dst),
+ "shl{q}\t$dst",
+ [(store (shl (loadi64 addr:$dst), (i8 1)), addr:$dst)]>;
+
+let isTwoAddress = 1 in {
+let Uses = [CL] in
+def SHR64rCL : RI<0xD3, MRM5r, (outs GR64:$dst), (ins GR64:$src),
+ "shr{q}\t{%cl, $dst|$dst, %CL}",
+ [(set GR64:$dst, (srl GR64:$src, CL))]>;
+def SHR64ri : RIi8<0xC1, MRM5r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$src2),
+ "shr{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (srl GR64:$src1, (i8 imm:$src2)))]>;
+def SHR64r1 : RI<0xD1, MRM5r, (outs GR64:$dst), (ins GR64:$src1),
+ "shr{q}\t$dst",
+ [(set GR64:$dst, (srl GR64:$src1, (i8 1)))]>;
+} // isTwoAddress
+
+let Uses = [CL] in
+def SHR64mCL : RI<0xD3, MRM5m, (outs), (ins i64mem:$dst),
+ "shr{q}\t{%cl, $dst|$dst, %CL}",
+ [(store (srl (loadi64 addr:$dst), CL), addr:$dst)]>;
+def SHR64mi : RIi8<0xC1, MRM5m, (outs), (ins i64mem:$dst, i8imm:$src),
+ "shr{q}\t{$src, $dst|$dst, $src}",
+ [(store (srl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+def SHR64m1 : RI<0xD1, MRM5m, (outs), (ins i64mem:$dst),
+ "shr{q}\t$dst",
+ [(store (srl (loadi64 addr:$dst), (i8 1)), addr:$dst)]>;
+
+let isTwoAddress = 1 in {
+let Uses = [CL] in
+def SAR64rCL : RI<0xD3, MRM7r, (outs GR64:$dst), (ins GR64:$src),
+ "sar{q}\t{%cl, $dst|$dst, %CL}",
+ [(set GR64:$dst, (sra GR64:$src, CL))]>;
+def SAR64ri : RIi8<0xC1, MRM7r, (outs GR64:$dst),
+ (ins GR64:$src1, i8imm:$src2),
+ "sar{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (sra GR64:$src1, (i8 imm:$src2)))]>;
+def SAR64r1 : RI<0xD1, MRM7r, (outs GR64:$dst), (ins GR64:$src1),
+ "sar{q}\t$dst",
+ [(set GR64:$dst, (sra GR64:$src1, (i8 1)))]>;
+} // isTwoAddress
+
+let Uses = [CL] in
+def SAR64mCL : RI<0xD3, MRM7m, (outs), (ins i64mem:$dst),
+ "sar{q}\t{%cl, $dst|$dst, %CL}",
+ [(store (sra (loadi64 addr:$dst), CL), addr:$dst)]>;
+def SAR64mi : RIi8<0xC1, MRM7m, (outs), (ins i64mem:$dst, i8imm:$src),
+ "sar{q}\t{$src, $dst|$dst, $src}",
+ [(store (sra (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+def SAR64m1 : RI<0xD1, MRM7m, (outs), (ins i64mem:$dst),
+ "sar{q}\t$dst",
+ [(store (sra (loadi64 addr:$dst), (i8 1)), addr:$dst)]>;
+
+// Rotate instructions
+
+let isTwoAddress = 1 in {
+def RCL64r1 : RI<0xD1, MRM2r, (outs GR64:$dst), (ins GR64:$src),
+ "rcl{q}\t{1, $dst|$dst, 1}", []>;
+def RCL64ri : RIi8<0xC1, MRM2r, (outs GR64:$dst), (ins GR64:$src, i8imm:$cnt),
+ "rcl{q}\t{$cnt, $dst|$dst, $cnt}", []>;
+
+def RCR64r1 : RI<0xD1, MRM3r, (outs GR64:$dst), (ins GR64:$src),
+ "rcr{q}\t{1, $dst|$dst, 1}", []>;
+def RCR64ri : RIi8<0xC1, MRM3r, (outs GR64:$dst), (ins GR64:$src, i8imm:$cnt),
+ "rcr{q}\t{$cnt, $dst|$dst, $cnt}", []>;
+
+let Uses = [CL] in {
+def RCL64rCL : RI<0xD3, MRM2r, (outs GR64:$dst), (ins GR64:$src),
+ "rcl{q}\t{%cl, $dst|$dst, CL}", []>;
+def RCR64rCL : RI<0xD3, MRM3r, (outs GR64:$dst), (ins GR64:$src),
+ "rcr{q}\t{%cl, $dst|$dst, CL}", []>;
+}
+}
+
+let isTwoAddress = 0 in {
+def RCL64m1 : RI<0xD1, MRM2m, (outs), (ins i64mem:$dst),
+ "rcl{q}\t{1, $dst|$dst, 1}", []>;
+def RCL64mi : RIi8<0xC1, MRM2m, (outs), (ins i64mem:$dst, i8imm:$cnt),
+ "rcl{q}\t{$cnt, $dst|$dst, $cnt}", []>;
+def RCR64m1 : RI<0xD1, MRM3m, (outs), (ins i64mem:$dst),
+ "rcr{q}\t{1, $dst|$dst, 1}", []>;
+def RCR64mi : RIi8<0xC1, MRM3m, (outs), (ins i64mem:$dst, i8imm:$cnt),
+ "rcr{q}\t{$cnt, $dst|$dst, $cnt}", []>;
+
+let Uses = [CL] in {
+def RCL64mCL : RI<0xD3, MRM2m, (outs), (ins i64mem:$dst),
+ "rcl{q}\t{%cl, $dst|$dst, CL}", []>;
+def RCR64mCL : RI<0xD3, MRM3m, (outs), (ins i64mem:$dst),
+ "rcr{q}\t{%cl, $dst|$dst, CL}", []>;
+}
+}
+
+let isTwoAddress = 1 in {
+let Uses = [CL] in
+def ROL64rCL : RI<0xD3, MRM0r, (outs GR64:$dst), (ins GR64:$src),
+ "rol{q}\t{%cl, $dst|$dst, %CL}",
+ [(set GR64:$dst, (rotl GR64:$src, CL))]>;
+def ROL64ri : RIi8<0xC1, MRM0r, (outs GR64:$dst),
+ (ins GR64:$src1, i8imm:$src2),
+ "rol{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (rotl GR64:$src1, (i8 imm:$src2)))]>;
+def ROL64r1 : RI<0xD1, MRM0r, (outs GR64:$dst), (ins GR64:$src1),
+ "rol{q}\t$dst",
+ [(set GR64:$dst, (rotl GR64:$src1, (i8 1)))]>;
+} // isTwoAddress
+
+let Uses = [CL] in
+def ROL64mCL : RI<0xD3, MRM0m, (outs), (ins i64mem:$dst),
+ "rol{q}\t{%cl, $dst|$dst, %CL}",
+ [(store (rotl (loadi64 addr:$dst), CL), addr:$dst)]>;
+def ROL64mi : RIi8<0xC1, MRM0m, (outs), (ins i64mem:$dst, i8imm:$src),
+ "rol{q}\t{$src, $dst|$dst, $src}",
+ [(store (rotl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+def ROL64m1 : RI<0xD1, MRM0m, (outs), (ins i64mem:$dst),
+ "rol{q}\t$dst",
+ [(store (rotl (loadi64 addr:$dst), (i8 1)), addr:$dst)]>;
+
+let isTwoAddress = 1 in {
+let Uses = [CL] in
+def ROR64rCL : RI<0xD3, MRM1r, (outs GR64:$dst), (ins GR64:$src),
+ "ror{q}\t{%cl, $dst|$dst, %CL}",
+ [(set GR64:$dst, (rotr GR64:$src, CL))]>;
+def ROR64ri : RIi8<0xC1, MRM1r, (outs GR64:$dst),
+ (ins GR64:$src1, i8imm:$src2),
+ "ror{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (rotr GR64:$src1, (i8 imm:$src2)))]>;
+def ROR64r1 : RI<0xD1, MRM1r, (outs GR64:$dst), (ins GR64:$src1),
+ "ror{q}\t$dst",
+ [(set GR64:$dst, (rotr GR64:$src1, (i8 1)))]>;
+} // isTwoAddress
+
+let Uses = [CL] in
+def ROR64mCL : RI<0xD3, MRM1m, (outs), (ins i64mem:$dst),
+ "ror{q}\t{%cl, $dst|$dst, %CL}",
+ [(store (rotr (loadi64 addr:$dst), CL), addr:$dst)]>;
+def ROR64mi : RIi8<0xC1, MRM1m, (outs), (ins i64mem:$dst, i8imm:$src),
+ "ror{q}\t{$src, $dst|$dst, $src}",
+ [(store (rotr (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+def ROR64m1 : RI<0xD1, MRM1m, (outs), (ins i64mem:$dst),
+ "ror{q}\t$dst",
+ [(store (rotr (loadi64 addr:$dst), (i8 1)), addr:$dst)]>;
+
+// Double shift instructions (generalizations of rotate)
+let isTwoAddress = 1 in {
+let Uses = [CL] in {
+def SHLD64rrCL : RI<0xA5, MRMDestReg, (outs GR64:$dst),
+ (ins GR64:$src1, GR64:$src2),
+ "shld{q}\t{%cl, $src2, $dst|$dst, $src2, %CL}",
+ [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2, CL))]>,
+ TB;
+def SHRD64rrCL : RI<0xAD, MRMDestReg, (outs GR64:$dst),
+ (ins GR64:$src1, GR64:$src2),
+ "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, %CL}",
+ [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, CL))]>,
+ TB;
+}
+
+let isCommutable = 1 in { // FIXME: Update X86InstrInfo::commuteInstruction
+def SHLD64rri8 : RIi8<0xA4, MRMDestReg,
+ (outs GR64:$dst),
+ (ins GR64:$src1, GR64:$src2, i8imm:$src3),
+ "shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2,
+ (i8 imm:$src3)))]>,
+ TB;
+def SHRD64rri8 : RIi8<0xAC, MRMDestReg,
+ (outs GR64:$dst),
+ (ins GR64:$src1, GR64:$src2, i8imm:$src3),
+ "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2,
+ (i8 imm:$src3)))]>,
+ TB;
+} // isCommutable
+} // isTwoAddress
+
+let Uses = [CL] in {
+def SHLD64mrCL : RI<0xA5, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
+ "shld{q}\t{%cl, $src2, $dst|$dst, $src2, %CL}",
+ [(store (X86shld (loadi64 addr:$dst), GR64:$src2, CL),
+ addr:$dst)]>, TB;
+def SHRD64mrCL : RI<0xAD, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
+ "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, %CL}",
+ [(store (X86shrd (loadi64 addr:$dst), GR64:$src2, CL),
+ addr:$dst)]>, TB;
+}
+def SHLD64mri8 : RIi8<0xA4, MRMDestMem,
+ (outs), (ins i64mem:$dst, GR64:$src2, i8imm:$src3),
+ "shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(store (X86shld (loadi64 addr:$dst), GR64:$src2,
+ (i8 imm:$src3)), addr:$dst)]>,
+ TB;
+def SHRD64mri8 : RIi8<0xAC, MRMDestMem,
+ (outs), (ins i64mem:$dst, GR64:$src2, i8imm:$src3),
+ "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(store (X86shrd (loadi64 addr:$dst), GR64:$src2,
+ (i8 imm:$src3)), addr:$dst)]>,
+ TB;
+} // Defs = [EFLAGS]
+
+//===----------------------------------------------------------------------===//
+// Logical Instructions...
+//
+
+let isTwoAddress = 1 , AddedComplexity = 15 in
+def NOT64r : RI<0xF7, MRM2r, (outs GR64:$dst), (ins GR64:$src), "not{q}\t$dst",
+ [(set GR64:$dst, (not GR64:$src))]>;
+def NOT64m : RI<0xF7, MRM2m, (outs), (ins i64mem:$dst), "not{q}\t$dst",
+ [(store (not (loadi64 addr:$dst)), addr:$dst)]>;
+
+let Defs = [EFLAGS] in {
+def AND64i32 : RIi32<0x25, RawFrm, (outs), (ins i64i32imm:$src),
+ "and{q}\t{$src, %rax|%rax, $src}", []>;
+
+let isTwoAddress = 1 in {
+let isCommutable = 1 in
+def AND64rr : RI<0x21, MRMDestReg,
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "and{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86and_flag GR64:$src1, GR64:$src2))]>;
+let isCodeGenOnly = 1 in {
+def AND64rr_REV : RI<0x23, MRMSrcReg, (outs GR64:$dst),
+ (ins GR64:$src1, GR64:$src2),
+ "and{q}\t{$src2, $dst|$dst, $src2}", []>;
+}
+def AND64rm : RI<0x23, MRMSrcMem,
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "and{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86and_flag GR64:$src1, (load addr:$src2)))]>;
+def AND64ri8 : RIi8<0x83, MRM4r,
+ (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
+ "and{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86and_flag GR64:$src1, i64immSExt8:$src2))]>;
+def AND64ri32 : RIi32<0x81, MRM4r,
+ (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
+ "and{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86and_flag GR64:$src1, i64immSExt32:$src2))]>;
+} // isTwoAddress
+
+def AND64mr : RI<0x21, MRMDestMem,
+ (outs), (ins i64mem:$dst, GR64:$src),
+ "and{q}\t{$src, $dst|$dst, $src}",
+ [(store (and (load addr:$dst), GR64:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+def AND64mi8 : RIi8<0x83, MRM4m,
+ (outs), (ins i64mem:$dst, i64i8imm :$src),
+ "and{q}\t{$src, $dst|$dst, $src}",
+ [(store (and (load addr:$dst), i64immSExt8:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+def AND64mi32 : RIi32<0x81, MRM4m,
+ (outs), (ins i64mem:$dst, i64i32imm:$src),
+ "and{q}\t{$src, $dst|$dst, $src}",
+ [(store (and (loadi64 addr:$dst), i64immSExt32:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+
+let isTwoAddress = 1 in {
+let isCommutable = 1 in
+def OR64rr : RI<0x09, MRMDestReg, (outs GR64:$dst),
+ (ins GR64:$src1, GR64:$src2),
+ "or{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86or_flag GR64:$src1, GR64:$src2))]>;
+let isCodeGenOnly = 1 in {
+def OR64rr_REV : RI<0x0B, MRMSrcReg, (outs GR64:$dst),
+ (ins GR64:$src1, GR64:$src2),
+ "or{q}\t{$src2, $dst|$dst, $src2}", []>;
+}
+def OR64rm : RI<0x0B, MRMSrcMem , (outs GR64:$dst),
+ (ins GR64:$src1, i64mem:$src2),
+ "or{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86or_flag GR64:$src1, (load addr:$src2)))]>;
+def OR64ri8 : RIi8<0x83, MRM1r, (outs GR64:$dst),
+ (ins GR64:$src1, i64i8imm:$src2),
+ "or{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86or_flag GR64:$src1, i64immSExt8:$src2))]>;
+def OR64ri32 : RIi32<0x81, MRM1r, (outs GR64:$dst),
+ (ins GR64:$src1, i64i32imm:$src2),
+ "or{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86or_flag GR64:$src1, i64immSExt32:$src2))]>;
+} // isTwoAddress
+
+def OR64mr : RI<0x09, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+ "or{q}\t{$src, $dst|$dst, $src}",
+ [(store (or (load addr:$dst), GR64:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+def OR64mi8 : RIi8<0x83, MRM1m, (outs), (ins i64mem:$dst, i64i8imm:$src),
+ "or{q}\t{$src, $dst|$dst, $src}",
+ [(store (or (load addr:$dst), i64immSExt8:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+def OR64mi32 : RIi32<0x81, MRM1m, (outs), (ins i64mem:$dst, i64i32imm:$src),
+ "or{q}\t{$src, $dst|$dst, $src}",
+ [(store (or (loadi64 addr:$dst), i64immSExt32:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+
+def OR64i32 : RIi32<0x0D, RawFrm, (outs), (ins i64i32imm:$src),
+ "or{q}\t{$src, %rax|%rax, $src}", []>;
+
+let isTwoAddress = 1 in {
+let isCommutable = 1 in
+def XOR64rr : RI<0x31, MRMDestReg, (outs GR64:$dst),
+ (ins GR64:$src1, GR64:$src2),
+ "xor{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86xor_flag GR64:$src1, GR64:$src2))]>;
+let isCodeGenOnly = 1 in {
+def XOR64rr_REV : RI<0x33, MRMSrcReg, (outs GR64:$dst),
+ (ins GR64:$src1, GR64:$src2),
+ "xor{q}\t{$src2, $dst|$dst, $src2}", []>;
+}
+def XOR64rm : RI<0x33, MRMSrcMem, (outs GR64:$dst),
+ (ins GR64:$src1, i64mem:$src2),
+ "xor{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86xor_flag GR64:$src1, (load addr:$src2)))]>;
+def XOR64ri8 : RIi8<0x83, MRM6r, (outs GR64:$dst),
+ (ins GR64:$src1, i64i8imm:$src2),
+ "xor{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86xor_flag GR64:$src1, i64immSExt8:$src2))]>;
+def XOR64ri32 : RIi32<0x81, MRM6r,
+ (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
+ "xor{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, EFLAGS,
+ (X86xor_flag GR64:$src1, i64immSExt32:$src2))]>;
+} // isTwoAddress
+
+def XOR64mr : RI<0x31, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+ "xor{q}\t{$src, $dst|$dst, $src}",
+ [(store (xor (load addr:$dst), GR64:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+def XOR64mi8 : RIi8<0x83, MRM6m, (outs), (ins i64mem:$dst, i64i8imm :$src),
+ "xor{q}\t{$src, $dst|$dst, $src}",
+ [(store (xor (load addr:$dst), i64immSExt8:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+def XOR64mi32 : RIi32<0x81, MRM6m, (outs), (ins i64mem:$dst, i64i32imm:$src),
+ "xor{q}\t{$src, $dst|$dst, $src}",
+ [(store (xor (loadi64 addr:$dst), i64immSExt32:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+
+def XOR64i32 : RIi32<0x35, RawFrm, (outs), (ins i64i32imm:$src),
+ "xor{q}\t{$src, %rax|%rax, $src}", []>;
+
+} // Defs = [EFLAGS]
+
+//===----------------------------------------------------------------------===//
+// Comparison Instructions...
+//
+
+// Integer comparison
+let Defs = [EFLAGS] in {
+def TEST64i32 : RIi32<0xA9, RawFrm, (outs), (ins i64i32imm:$src),
+ "test{q}\t{$src, %rax|%rax, $src}", []>;
+let isCommutable = 1 in
+def TEST64rr : RI<0x85, MRMSrcReg, (outs), (ins GR64:$src1, GR64:$src2),
+ "test{q}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86cmp (and GR64:$src1, GR64:$src2), 0))]>;
+def TEST64rm : RI<0x85, MRMSrcMem, (outs), (ins GR64:$src1, i64mem:$src2),
+ "test{q}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86cmp (and GR64:$src1, (loadi64 addr:$src2)),
+ 0))]>;
+def TEST64ri32 : RIi32<0xF7, MRM0r, (outs),
+ (ins GR64:$src1, i64i32imm:$src2),
+ "test{q}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86cmp (and GR64:$src1, i64immSExt32:$src2),
+ 0))]>;
+def TEST64mi32 : RIi32<0xF7, MRM0m, (outs),
+ (ins i64mem:$src1, i64i32imm:$src2),
+ "test{q}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86cmp (and (loadi64 addr:$src1),
+ i64immSExt32:$src2), 0))]>;
+
+
+def CMP64i32 : RIi32<0x3D, RawFrm, (outs), (ins i64i32imm:$src),
+ "cmp{q}\t{$src, %rax|%rax, $src}", []>;
+def CMP64rr : RI<0x39, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
+ "cmp{q}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86cmp GR64:$src1, GR64:$src2))]>;
+
+// These are alternate spellings for use by the disassembler; we mark them as
+// code gen only to ensure they aren't matched by the assembler.
+let isCodeGenOnly = 1 in {
+ def CMP64mrmrr : RI<0x3B, MRMSrcReg, (outs), (ins GR64:$src1, GR64:$src2),
+ "cmp{q}\t{$src2, $src1|$src1, $src2}", []>;
+}
+
+def CMP64mr : RI<0x39, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
+ "cmp{q}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86cmp (loadi64 addr:$src1), GR64:$src2))]>;
+def CMP64rm : RI<0x3B, MRMSrcMem, (outs), (ins GR64:$src1, i64mem:$src2),
+ "cmp{q}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86cmp GR64:$src1, (loadi64 addr:$src2)))]>;
+def CMP64ri8 : RIi8<0x83, MRM7r, (outs), (ins GR64:$src1, i64i8imm:$src2),
+ "cmp{q}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86cmp GR64:$src1, i64immSExt8:$src2))]>;
+def CMP64ri32 : RIi32<0x81, MRM7r, (outs), (ins GR64:$src1, i64i32imm:$src2),
+ "cmp{q}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86cmp GR64:$src1, i64immSExt32:$src2))]>;
+def CMP64mi8 : RIi8<0x83, MRM7m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
+ "cmp{q}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86cmp (loadi64 addr:$src1),
+ i64immSExt8:$src2))]>;
+def CMP64mi32 : RIi32<0x81, MRM7m, (outs),
+ (ins i64mem:$src1, i64i32imm:$src2),
+ "cmp{q}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86cmp (loadi64 addr:$src1),
+ i64immSExt32:$src2))]>;
+} // Defs = [EFLAGS]
+
+// Bit tests.
+// TODO: BTC, BTR, and BTS
+let Defs = [EFLAGS] in {
+def BT64rr : RI<0xA3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
+ "bt{q}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt GR64:$src1, GR64:$src2))]>, TB;
+
+// Unlike with the register+register form, the memory+register form of the
+// bt instruction does not ignore the high bits of the index. From ISel's
+// perspective, this is pretty bizarre, so the load-folding pattern below is
+// disabled for now.
+def BT64mr : RI<0xA3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
+ "bt{q}\t{$src2, $src1|$src1, $src2}",
+// [(X86bt (loadi64 addr:$src1), GR64:$src2),
+// (implicit EFLAGS)]
+ []
+ >, TB;
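+// For example, "btq %rax, (%rdi)" treats %rax as an offset into a bit string
+// based at (%rdi) and may access memory beyond the addressed quadword, whereas
+// the register form "btq %rax, %rcx" only uses %rax modulo 64, which is the
+// mismatch the comment above refers to.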
+
+def BT64ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR64:$src1, i64i8imm:$src2),
+ "bt{q}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt GR64:$src1, i64immSExt8:$src2))]>, TB,
+ REX_W;
+// Note that these instructions don't need FastBTMem because that
+// only applies when the other operand is in a register. When it's
+// an immediate, bt is still fast.
+def BT64mi8 : Ii8<0xBA, MRM4m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
+ "bt{q}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt (loadi64 addr:$src1),
+ i64immSExt8:$src2))]>, TB;
+
+def BTC64rr : RI<0xBB, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
+ "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
+def BTC64mr : RI<0xBB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
+ "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
+def BTC64ri8 : RIi8<0xBA, MRM7r, (outs), (ins GR64:$src1, i64i8imm:$src2),
+ "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
+def BTC64mi8 : RIi8<0xBA, MRM7m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
+ "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
+
+def BTR64rr : RI<0xB3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
+ "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
+def BTR64mr : RI<0xB3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
+ "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
+def BTR64ri8 : RIi8<0xBA, MRM6r, (outs), (ins GR64:$src1, i64i8imm:$src2),
+ "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
+def BTR64mi8 : RIi8<0xBA, MRM6m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
+ "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
+
+def BTS64rr : RI<0xAB, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
+ "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
+def BTS64mr : RI<0xAB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
+ "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
+def BTS64ri8 : RIi8<0xBA, MRM5r, (outs), (ins GR64:$src1, i64i8imm:$src2),
+ "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
+def BTS64mi8 : RIi8<0xBA, MRM5m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
+ "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
+} // Defs = [EFLAGS]
+
+// Conditional moves
+let Uses = [EFLAGS], isTwoAddress = 1 in {
+let isCommutable = 1 in {
+def CMOVB64rr : RI<0x42, MRMSrcReg, // if <u, GR64 = GR64
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "cmovb{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_B, EFLAGS))]>, TB;
+def CMOVAE64rr: RI<0x43, MRMSrcReg, // if >=u, GR64 = GR64
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "cmovae{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_AE, EFLAGS))]>, TB;
+def CMOVE64rr : RI<0x44, MRMSrcReg, // if ==, GR64 = GR64
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "cmove{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_E, EFLAGS))]>, TB;
+def CMOVNE64rr: RI<0x45, MRMSrcReg, // if !=, GR64 = GR64
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "cmovne{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_NE, EFLAGS))]>, TB;
+def CMOVBE64rr: RI<0x46, MRMSrcReg, // if <=u, GR64 = GR64
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "cmovbe{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_BE, EFLAGS))]>, TB;
+def CMOVA64rr : RI<0x47, MRMSrcReg, // if >u, GR64 = GR64
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "cmova{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_A, EFLAGS))]>, TB;
+def CMOVL64rr : RI<0x4C, MRMSrcReg, // if <s, GR64 = GR64
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "cmovl{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_L, EFLAGS))]>, TB;
+def CMOVGE64rr: RI<0x4D, MRMSrcReg, // if >=s, GR64 = GR64
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "cmovge{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_GE, EFLAGS))]>, TB;
+def CMOVLE64rr: RI<0x4E, MRMSrcReg, // if <=s, GR64 = GR64
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "cmovle{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_LE, EFLAGS))]>, TB;
+def CMOVG64rr : RI<0x4F, MRMSrcReg, // if >s, GR64 = GR64
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "cmovg{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_G, EFLAGS))]>, TB;
+def CMOVS64rr : RI<0x48, MRMSrcReg, // if signed, GR64 = GR64
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "cmovs{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_S, EFLAGS))]>, TB;
+def CMOVNS64rr: RI<0x49, MRMSrcReg, // if !signed, GR64 = GR64
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "cmovns{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_NS, EFLAGS))]>, TB;
+def CMOVP64rr : RI<0x4A, MRMSrcReg, // if parity, GR64 = GR64
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "cmovp{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_P, EFLAGS))]>, TB;
+def CMOVNP64rr : RI<0x4B, MRMSrcReg, // if !parity, GR64 = GR64
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "cmovnp{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_NP, EFLAGS))]>, TB;
+def CMOVO64rr : RI<0x40, MRMSrcReg, // if overflow, GR64 = GR64
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "cmovo{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_O, EFLAGS))]>, TB;
+def CMOVNO64rr : RI<0x41, MRMSrcReg, // if !overflow, GR64 = GR64
+ (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "cmovno{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_NO, EFLAGS))]>, TB;
+} // isCommutable = 1
+
+def CMOVB64rm : RI<0x42, MRMSrcMem, // if <u, GR64 = [mem64]
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "cmovb{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_B, EFLAGS))]>, TB;
+def CMOVAE64rm: RI<0x43, MRMSrcMem, // if >=u, GR64 = [mem64]
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "cmovae{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_AE, EFLAGS))]>, TB;
+def CMOVE64rm : RI<0x44, MRMSrcMem, // if ==, GR64 = [mem64]
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "cmove{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_E, EFLAGS))]>, TB;
+def CMOVNE64rm: RI<0x45, MRMSrcMem, // if !=, GR64 = [mem64]
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "cmovne{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_NE, EFLAGS))]>, TB;
+def CMOVBE64rm: RI<0x46, MRMSrcMem, // if <=u, GR64 = [mem64]
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "cmovbe{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_BE, EFLAGS))]>, TB;
+def CMOVA64rm : RI<0x47, MRMSrcMem, // if >u, GR64 = [mem64]
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "cmova{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_A, EFLAGS))]>, TB;
+def CMOVL64rm : RI<0x4C, MRMSrcMem, // if <s, GR64 = [mem64]
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "cmovl{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_L, EFLAGS))]>, TB;
+def CMOVGE64rm: RI<0x4D, MRMSrcMem, // if >=s, GR64 = [mem64]
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "cmovge{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_GE, EFLAGS))]>, TB;
+def CMOVLE64rm: RI<0x4E, MRMSrcMem, // if <=s, GR64 = [mem64]
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "cmovle{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_LE, EFLAGS))]>, TB;
+def CMOVG64rm : RI<0x4F, MRMSrcMem, // if >s, GR64 = [mem64]
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "cmovg{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_G, EFLAGS))]>, TB;
+def CMOVS64rm : RI<0x48, MRMSrcMem, // if signed, GR64 = [mem64]
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "cmovs{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_S, EFLAGS))]>, TB;
+def CMOVNS64rm: RI<0x49, MRMSrcMem, // if !signed, GR64 = [mem64]
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "cmovns{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_NS, EFLAGS))]>, TB;
+def CMOVP64rm : RI<0x4A, MRMSrcMem, // if parity, GR64 = [mem64]
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "cmovp{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_P, EFLAGS))]>, TB;
+def CMOVNP64rm : RI<0x4B, MRMSrcMem, // if !parity, GR64 = [mem64]
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "cmovnp{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_NP, EFLAGS))]>, TB;
+def CMOVO64rm : RI<0x40, MRMSrcMem, // if overflow, GR64 = [mem64]
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "cmovo{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_O, EFLAGS))]>, TB;
+def CMOVNO64rm : RI<0x41, MRMSrcMem, // if !overflow, GR64 = [mem64]
+ (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+ "cmovno{q}\t{$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_NO, EFLAGS))]>, TB;
+} // isTwoAddress
+
+// Use sbb to materialize carry flag into a GPR.
+// FIXME: These are pseudo ops that should be replaced with Pat<> patterns.
+// However, Pat<> can't replicate the destination reg into the inputs of the
+// result.
+// FIXME: Change this to have encoding Pseudo when X86MCCodeEmitter replaces
+// X86CodeEmitter.
+let Defs = [EFLAGS], Uses = [EFLAGS], isCodeGenOnly = 1 in
+def SETB_C64r : RI<0x19, MRMInitReg, (outs GR64:$dst), (ins), "",
+ [(set GR64:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
+
+def : Pat<(i64 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+ (SETB_C64r)>;
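+// For example, SETB_C64r is emitted as "sbbq %rax, %rax", which leaves 0 in
+// %rax when the carry flag is clear and -1 (all ones) when it is set; an AND
+// with 1 then turns that into a 0/1 value if needed.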
+
+//===----------------------------------------------------------------------===//
+// Conversion Instructions...
+//
+
+// f64 -> signed i64
+def CVTSD2SI64rr: RSDI<0x2D, MRMSrcReg, (outs GR64:$dst), (ins FR64:$src),
+ "cvtsd2si{q}\t{$src, $dst|$dst, $src}", []>;
+def CVTSD2SI64rm: RSDI<0x2D, MRMSrcMem, (outs GR64:$dst), (ins f64mem:$src),
+ "cvtsd2si{q}\t{$src, $dst|$dst, $src}", []>;
+def Int_CVTSD2SI64rr: RSDI<0x2D, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
+ "cvtsd2si{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst,
+ (int_x86_sse2_cvtsd2si64 VR128:$src))]>;
+def Int_CVTSD2SI64rm: RSDI<0x2D, MRMSrcMem, (outs GR64:$dst),
+ (ins f128mem:$src),
+ "cvtsd2si{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (int_x86_sse2_cvtsd2si64
+ (load addr:$src)))]>;
+def CVTTSD2SI64rr: RSDI<0x2C, MRMSrcReg, (outs GR64:$dst), (ins FR64:$src),
+ "cvttsd2si{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (fp_to_sint FR64:$src))]>;
+def CVTTSD2SI64rm: RSDI<0x2C, MRMSrcMem, (outs GR64:$dst), (ins f64mem:$src),
+ "cvttsd2si{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (fp_to_sint (loadf64 addr:$src)))]>;
+def Int_CVTTSD2SI64rr: RSDI<0x2C, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
+ "cvttsd2si{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst,
+ (int_x86_sse2_cvttsd2si64 VR128:$src))]>;
+def Int_CVTTSD2SI64rm: RSDI<0x2C, MRMSrcMem, (outs GR64:$dst),
+ (ins f128mem:$src),
+ "cvttsd2si{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst,
+ (int_x86_sse2_cvttsd2si64
+ (load addr:$src)))]>;
+
+// Signed i64 -> f64
+def CVTSI2SD64rr: RSDI<0x2A, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
+ "cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (sint_to_fp GR64:$src))]>;
+def CVTSI2SD64rm: RSDI<0x2A, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
+ "cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (sint_to_fp (loadi64 addr:$src)))]>;
+
+let isTwoAddress = 1 in {
+def Int_CVTSI2SD64rr: RSDI<0x2A, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, GR64:$src2),
+ "cvtsi2sd{q}\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtsi642sd VR128:$src1,
+ GR64:$src2))]>;
+def Int_CVTSI2SD64rm: RSDI<0x2A, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i64mem:$src2),
+ "cvtsi2sd{q}\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvtsi642sd VR128:$src1,
+ (loadi64 addr:$src2)))]>;
+} // isTwoAddress
+
+// Signed i64 -> f32
+def CVTSI2SS64rr: RSSI<0x2A, MRMSrcReg, (outs FR32:$dst), (ins GR64:$src),
+ "cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (sint_to_fp GR64:$src))]>;
+def CVTSI2SS64rm: RSSI<0x2A, MRMSrcMem, (outs FR32:$dst), (ins i64mem:$src),
+ "cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (sint_to_fp (loadi64 addr:$src)))]>;
+
+let isTwoAddress = 1 in {
+ def Int_CVTSI2SS64rr : RSSI<0x2A, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, GR64:$src2),
+ "cvtsi2ss{q}\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (int_x86_sse_cvtsi642ss VR128:$src1,
+ GR64:$src2))]>;
+ def Int_CVTSI2SS64rm : RSSI<0x2A, MRMSrcMem,
+ (outs VR128:$dst),
+ (ins VR128:$src1, i64mem:$src2),
+ "cvtsi2ss{q}\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (int_x86_sse_cvtsi642ss VR128:$src1,
+ (loadi64 addr:$src2)))]>;
+}
+
+// f32 -> signed i64
+def CVTSS2SI64rr: RSSI<0x2D, MRMSrcReg, (outs GR64:$dst), (ins FR32:$src),
+ "cvtss2si{q}\t{$src, $dst|$dst, $src}", []>;
+def CVTSS2SI64rm: RSSI<0x2D, MRMSrcMem, (outs GR64:$dst), (ins f32mem:$src),
+ "cvtss2si{q}\t{$src, $dst|$dst, $src}", []>;
+def Int_CVTSS2SI64rr: RSSI<0x2D, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
+ "cvtss2si{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst,
+ (int_x86_sse_cvtss2si64 VR128:$src))]>;
+def Int_CVTSS2SI64rm: RSSI<0x2D, MRMSrcMem, (outs GR64:$dst), (ins f32mem:$src),
+ "cvtss2si{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (int_x86_sse_cvtss2si64
+ (load addr:$src)))]>;
+def CVTTSS2SI64rr: RSSI<0x2C, MRMSrcReg, (outs GR64:$dst), (ins FR32:$src),
+ "cvttss2si{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (fp_to_sint FR32:$src))]>;
+def CVTTSS2SI64rm: RSSI<0x2C, MRMSrcMem, (outs GR64:$dst), (ins f32mem:$src),
+ "cvttss2si{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (fp_to_sint (loadf32 addr:$src)))]>;
+def Int_CVTTSS2SI64rr: RSSI<0x2C, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
+ "cvttss2si{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst,
+ (int_x86_sse_cvttss2si64 VR128:$src))]>;
+def Int_CVTTSS2SI64rm: RSSI<0x2C, MRMSrcMem, (outs GR64:$dst),
+ (ins f32mem:$src),
+ "cvttss2si{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst,
+ (int_x86_sse_cvttss2si64 (load addr:$src)))]>;
+
+// Descriptor-table support instructions
+
+// LLDT is not interpreted specially in 64-bit mode because there is no sign
+// extension.
+def SLDT64r : RI<0x00, MRM0r, (outs GR64:$dst), (ins),
+ "sldt{q}\t$dst", []>, TB;
+def SLDT64m : RI<0x00, MRM0m, (outs i16mem:$dst), (ins),
+ "sldt{q}\t$dst", []>, TB;
+
+//===----------------------------------------------------------------------===//
+// Alias Instructions
+//===----------------------------------------------------------------------===//
+
+// We want to rewrite MOV64r0 in terms of MOV32r0, because it's sometimes a
+// smaller encoding, but doing so at isel time interferes with rematerialization
+// in the current register allocator. For now, this is rewritten when the
+// instruction is lowered to an MCInst.
+// FIXME: AddedComplexity gives this a higher priority than MOV64ri32. Remove
+// when we have a better way to specify isel priority.
+let Defs = [EFLAGS],
+ AddedComplexity = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in
+def MOV64r0 : I<0x31, MRMInitReg, (outs GR64:$dst), (ins), "",
+ [(set GR64:$dst, 0)]>;
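+
+// For illustration: lowering MOV64r0 as the 32-bit "xorl %eax, %eax" (for a
+// destination allocated to RAX) is safe because writes to a 32-bit register
+// implicitly zero the upper 32 bits of the containing 64-bit register, and it
+// avoids the REX.W prefix that "xorq %rax, %rax" would need.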
+
+// Materialize an i64 constant whose top 32 bits are zero. This could
+// theoretically use MOV32ri with a SUBREG_TO_REG to represent the
+// zero-extension; however, that would make it more difficult to rematerialize.
+let AddedComplexity = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in
+def MOV64ri64i32 : Ii32<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64i32imm:$src),
+ "", [(set GR64:$dst, i64immZExt32:$src)]>;
+
+//===----------------------------------------------------------------------===//
+// Thread Local Storage Instructions
+//===----------------------------------------------------------------------===//
+
+// All calls clobber the non-callee saved registers. RSP is marked as
+// a use to prevent stack-pointer assignments that appear immediately
+// before calls from potentially appearing dead.
+let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
+ FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, ST1,
+ MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
+ XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+ XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
+ Uses = [RSP] in
+def TLS_addr64 : I<0, Pseudo, (outs), (ins lea64mem:$sym),
+ ".byte\t0x66; "
+ "leaq\t$sym(%rip), %rdi; "
+ ".word\t0x6666; "
+ "rex64; "
+ "call\t__tls_get_addr@PLT",
+ [(X86tlsaddr tls64addr:$sym)]>,
+ Requires<[In64BitMode]>;
+
+let AddedComplexity = 5, isCodeGenOnly = 1 in
+def MOV64GSrm : RI<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "movq\t%gs:$src, $dst",
+ [(set GR64:$dst, (gsload addr:$src))]>, SegGS;
+
+let AddedComplexity = 5, isCodeGenOnly = 1 in
+def MOV64FSrm : RI<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "movq\t%fs:$src, $dst",
+ [(set GR64:$dst, (fsload addr:$src))]>, SegFS;
+
+//===----------------------------------------------------------------------===//
+// Atomic Instructions
+//===----------------------------------------------------------------------===//
+
+let Defs = [RAX, EFLAGS], Uses = [RAX] in {
+def LCMPXCHG64 : RI<0xB1, MRMDestMem, (outs), (ins i64mem:$ptr, GR64:$swap),
+ "lock\n\t"
+ "cmpxchgq\t$swap,$ptr",
+ [(X86cas addr:$ptr, GR64:$swap, 8)]>, TB, LOCK;
+}
+
+let Constraints = "$val = $dst" in {
+let Defs = [EFLAGS] in
+def LXADD64 : RI<0xC1, MRMSrcMem, (outs GR64:$dst), (ins GR64:$val,i64mem:$ptr),
+ "lock\n\t"
+ "xadd\t$val, $ptr",
+ [(set GR64:$dst, (atomic_load_add_64 addr:$ptr, GR64:$val))]>,
+ TB, LOCK;
+
+def XCHG64rm : RI<0x87, MRMSrcMem, (outs GR64:$dst),
+ (ins GR64:$val,i64mem:$ptr),
+ "xchg{q}\t{$val, $ptr|$ptr, $val}",
+ [(set GR64:$dst, (atomic_swap_64 addr:$ptr, GR64:$val))]>;
+
+def XCHG64rr : RI<0x87, MRMSrcReg, (outs GR64:$dst), (ins GR64:$val,GR64:$src),
+ "xchg{q}\t{$val, $src|$src, $val}", []>;
+}
+
+def XADD64rr : RI<0xC1, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
+ "xadd{q}\t{$src, $dst|$dst, $src}", []>, TB;
+let mayLoad = 1, mayStore = 1 in
+def XADD64rm : RI<0xC1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+ "xadd{q}\t{$src, $dst|$dst, $src}", []>, TB;
+
+def CMPXCHG64rr : RI<0xB1, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
+ "cmpxchg{q}\t{$src, $dst|$dst, $src}", []>, TB;
+let mayLoad = 1, mayStore = 1 in
+def CMPXCHG64rm : RI<0xB1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+ "cmpxchg{q}\t{$src, $dst|$dst, $src}", []>, TB;
+
+let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX] in
+def CMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$dst),
+ "cmpxchg16b\t$dst", []>, TB;
+
+def XCHG64ar : RI<0x90, AddRegFrm, (outs), (ins GR64:$src),
+ "xchg{q}\t{$src, %rax|%rax, $src}", []>;
+
+// Optimized codegen when the non-memory output is not used.
+let Defs = [EFLAGS], mayLoad = 1, mayStore = 1 in {
+// FIXME: Use normal add / sub instructions and add lock prefix dynamically.
+def LOCK_ADD64mr : RI<0x03, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
+ "lock\n\t"
+ "add{q}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+def LOCK_ADD64mi8 : RIi8<0x83, MRM0m, (outs),
+ (ins i64mem:$dst, i64i8imm :$src2),
+ "lock\n\t"
+ "add{q}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+def LOCK_ADD64mi32 : RIi32<0x81, MRM0m, (outs),
+ (ins i64mem:$dst, i64i32imm :$src2),
+ "lock\n\t"
+ "add{q}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+def LOCK_SUB64mr : RI<0x29, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
+ "lock\n\t"
+ "sub{q}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+def LOCK_SUB64mi8 : RIi8<0x83, MRM5m, (outs),
+ (ins i64mem:$dst, i64i8imm :$src2),
+ "lock\n\t"
+ "sub{q}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+def LOCK_SUB64mi32 : RIi32<0x81, MRM5m, (outs),
+ (ins i64mem:$dst, i64i32imm:$src2),
+ "lock\n\t"
+ "sub{q}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+def LOCK_INC64m : RI<0xFF, MRM0m, (outs), (ins i64mem:$dst),
+ "lock\n\t"
+ "inc{q}\t$dst", []>, LOCK;
+def LOCK_DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst),
+ "lock\n\t"
+ "dec{q}\t$dst", []>, LOCK;
+}
+// Atomic exchange, and, or, xor
+let Constraints = "$val = $dst", Defs = [EFLAGS],
+ usesCustomInserter = 1 in {
+def ATOMAND64 : I<0, Pseudo, (outs GR64:$dst),(ins i64mem:$ptr, GR64:$val),
+ "#ATOMAND64 PSEUDO!",
+ [(set GR64:$dst, (atomic_load_and_64 addr:$ptr, GR64:$val))]>;
+def ATOMOR64 : I<0, Pseudo, (outs GR64:$dst),(ins i64mem:$ptr, GR64:$val),
+ "#ATOMOR64 PSEUDO!",
+ [(set GR64:$dst, (atomic_load_or_64 addr:$ptr, GR64:$val))]>;
+def ATOMXOR64 : I<0, Pseudo,(outs GR64:$dst),(ins i64mem:$ptr, GR64:$val),
+ "#ATOMXOR64 PSEUDO!",
+ [(set GR64:$dst, (atomic_load_xor_64 addr:$ptr, GR64:$val))]>;
+def ATOMNAND64 : I<0, Pseudo,(outs GR64:$dst),(ins i64mem:$ptr, GR64:$val),
+ "#ATOMNAND64 PSEUDO!",
+ [(set GR64:$dst, (atomic_load_nand_64 addr:$ptr, GR64:$val))]>;
+def ATOMMIN64: I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$ptr, GR64:$val),
+ "#ATOMMIN64 PSEUDO!",
+ [(set GR64:$dst, (atomic_load_min_64 addr:$ptr, GR64:$val))]>;
+def ATOMMAX64: I<0, Pseudo, (outs GR64:$dst),(ins i64mem:$ptr, GR64:$val),
+ "#ATOMMAX64 PSEUDO!",
+ [(set GR64:$dst, (atomic_load_max_64 addr:$ptr, GR64:$val))]>;
+def ATOMUMIN64: I<0, Pseudo, (outs GR64:$dst),(ins i64mem:$ptr, GR64:$val),
+ "#ATOMUMIN64 PSEUDO!",
+ [(set GR64:$dst, (atomic_load_umin_64 addr:$ptr, GR64:$val))]>;
+def ATOMUMAX64: I<0, Pseudo, (outs GR64:$dst),(ins i64mem:$ptr, GR64:$val),
+ "#ATOMUMAX64 PSEUDO!",
+ [(set GR64:$dst, (atomic_load_umax_64 addr:$ptr, GR64:$val))]>;
+}
+
+// Segmentation support instructions
+
+// The i16mem operand in LAR64rm and the GR32 operand in LAR64rr are not typos.
+def LAR64rm : RI<0x02, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
+ "lar{q}\t{$src, $dst|$dst, $src}", []>, TB;
+def LAR64rr : RI<0x02, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src),
+ "lar{q}\t{$src, $dst|$dst, $src}", []>, TB;
+
+def LSL64rm : RI<0x03, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "lsl{q}\t{$src, $dst|$dst, $src}", []>, TB;
+def LSL64rr : RI<0x03, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "lsl{q}\t{$src, $dst|$dst, $src}", []>, TB;
+
+def SWAPGS : I<0x01, MRM_F8, (outs), (ins), "swapgs", []>, TB;
+
+def PUSHFS64 : I<0xa0, RawFrm, (outs), (ins),
+ "push{q}\t%fs", []>, TB;
+def PUSHGS64 : I<0xa8, RawFrm, (outs), (ins),
+ "push{q}\t%gs", []>, TB;
+
+def POPFS64 : I<0xa1, RawFrm, (outs), (ins),
+ "pop{q}\t%fs", []>, TB;
+def POPGS64 : I<0xa9, RawFrm, (outs), (ins),
+ "pop{q}\t%gs", []>, TB;
+
+def LSS64rm : RI<0xb2, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src),
+ "lss{q}\t{$src, $dst|$dst, $src}", []>, TB;
+def LFS64rm : RI<0xb4, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src),
+ "lfs{q}\t{$src, $dst|$dst, $src}", []>, TB;
+def LGS64rm : RI<0xb5, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src),
+ "lgs{q}\t{$src, $dst|$dst, $src}", []>, TB;
+
+// Specialized register support
+
+// no m form encodable; use SMSW16m
+def SMSW64r : RI<0x01, MRM4r, (outs GR64:$dst), (ins),
+ "smsw{q}\t$dst", []>, TB;
+
+// String manipulation instructions
+
+def LODSQ : RI<0xAD, RawFrm, (outs), (ins), "lodsq", []>;
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// ConstantPool, GlobalAddress, ExternalSymbol, and JumpTable references, when
+// not in small code model mode, should use 'movabs'. FIXME: This is really a
+// hack; the 'movabs' predicate should handle this sort of thing.
+def : Pat<(i64 (X86Wrapper tconstpool :$dst)),
+ (MOV64ri tconstpool :$dst)>, Requires<[FarData]>;
+def : Pat<(i64 (X86Wrapper tjumptable :$dst)),
+ (MOV64ri tjumptable :$dst)>, Requires<[FarData]>;
+def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)),
+ (MOV64ri tglobaladdr :$dst)>, Requires<[FarData]>;
+def : Pat<(i64 (X86Wrapper texternalsym:$dst)),
+ (MOV64ri texternalsym:$dst)>, Requires<[FarData]>;
+def : Pat<(i64 (X86Wrapper tblockaddress:$dst)),
+ (MOV64ri tblockaddress:$dst)>, Requires<[FarData]>;
+
+// In static codegen with small code model, we can get the address of a label
+// into a register with 'movl'. FIXME: This is a hack, the 'imm' predicate of
+// the MOV64ri64i32 should accept these.
+def : Pat<(i64 (X86Wrapper tconstpool :$dst)),
+ (MOV64ri64i32 tconstpool :$dst)>, Requires<[SmallCode]>;
+def : Pat<(i64 (X86Wrapper tjumptable :$dst)),
+ (MOV64ri64i32 tjumptable :$dst)>, Requires<[SmallCode]>;
+def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)),
+ (MOV64ri64i32 tglobaladdr :$dst)>, Requires<[SmallCode]>;
+def : Pat<(i64 (X86Wrapper texternalsym:$dst)),
+ (MOV64ri64i32 texternalsym:$dst)>, Requires<[SmallCode]>;
+def : Pat<(i64 (X86Wrapper tblockaddress:$dst)),
+ (MOV64ri64i32 tblockaddress:$dst)>, Requires<[SmallCode]>;
+
+// In kernel code model, we can get the address of a label
+// into a register with 'movq'. FIXME: This is a hack, the 'imm' predicate of
+// the MOV64ri32 should accept these.
+def : Pat<(i64 (X86Wrapper tconstpool :$dst)),
+ (MOV64ri32 tconstpool :$dst)>, Requires<[KernelCode]>;
+def : Pat<(i64 (X86Wrapper tjumptable :$dst)),
+ (MOV64ri32 tjumptable :$dst)>, Requires<[KernelCode]>;
+def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)),
+ (MOV64ri32 tglobaladdr :$dst)>, Requires<[KernelCode]>;
+def : Pat<(i64 (X86Wrapper texternalsym:$dst)),
+ (MOV64ri32 texternalsym:$dst)>, Requires<[KernelCode]>;
+def : Pat<(i64 (X86Wrapper tblockaddress:$dst)),
+ (MOV64ri32 tblockaddress:$dst)>, Requires<[KernelCode]>;
+
+// If we have small model and -static mode, it is safe to store global addresses
+// directly as immediates. FIXME: This is really a hack, the 'imm' predicate
+// for MOV64mi32 should handle this sort of thing.
+def : Pat<(store (i64 (X86Wrapper tconstpool:$src)), addr:$dst),
+ (MOV64mi32 addr:$dst, tconstpool:$src)>,
+ Requires<[NearData, IsStatic]>;
+def : Pat<(store (i64 (X86Wrapper tjumptable:$src)), addr:$dst),
+ (MOV64mi32 addr:$dst, tjumptable:$src)>,
+ Requires<[NearData, IsStatic]>;
+def : Pat<(store (i64 (X86Wrapper tglobaladdr:$src)), addr:$dst),
+ (MOV64mi32 addr:$dst, tglobaladdr:$src)>,
+ Requires<[NearData, IsStatic]>;
+def : Pat<(store (i64 (X86Wrapper texternalsym:$src)), addr:$dst),
+ (MOV64mi32 addr:$dst, texternalsym:$src)>,
+ Requires<[NearData, IsStatic]>;
+def : Pat<(store (i64 (X86Wrapper tblockaddress:$src)), addr:$dst),
+ (MOV64mi32 addr:$dst, tblockaddress:$src)>,
+ Requires<[NearData, IsStatic]>;
+
+// Calls
+// Direct PC relative function call for small code model. 32-bit displacement
+// sign extended to 64-bit.
+def : Pat<(X86call (i64 tglobaladdr:$dst)),
+ (CALL64pcrel32 tglobaladdr:$dst)>, Requires<[NotWin64]>;
+def : Pat<(X86call (i64 texternalsym:$dst)),
+ (CALL64pcrel32 texternalsym:$dst)>, Requires<[NotWin64]>;
+
+def : Pat<(X86call (i64 tglobaladdr:$dst)),
+ (WINCALL64pcrel32 tglobaladdr:$dst)>, Requires<[IsWin64]>;
+def : Pat<(X86call (i64 texternalsym:$dst)),
+ (WINCALL64pcrel32 texternalsym:$dst)>, Requires<[IsWin64]>;
+
+// tailcall stuff
+def : Pat<(X86tcret GR64_TC:$dst, imm:$off),
+ (TCRETURNri64 GR64_TC:$dst, imm:$off)>,
+ Requires<[In64BitMode]>;
+
+def : Pat<(X86tcret (load addr:$dst), imm:$off),
+ (TCRETURNmi64 addr:$dst, imm:$off)>,
+ Requires<[In64BitMode]>;
+
+def : Pat<(X86tcret (i64 tglobaladdr:$dst), imm:$off),
+ (TCRETURNdi64 tglobaladdr:$dst, imm:$off)>,
+ Requires<[In64BitMode]>;
+
+def : Pat<(X86tcret (i64 texternalsym:$dst), imm:$off),
+ (TCRETURNdi64 texternalsym:$dst, imm:$off)>,
+ Requires<[In64BitMode]>;
+
+// Comparisons.
+
+// TEST R,R is smaller than CMP R,0
+def : Pat<(X86cmp GR64:$src1, 0),
+ (TEST64rr GR64:$src1, GR64:$src1)>;
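+
+// For example, "testq %rax, %rax" encodes in 3 bytes, while the equivalent
+// "cmpq $0, %rax" spends an extra byte on the immediate.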
+
+// Conditional moves with folded loads with operands swapped and conditions
+// inverted.
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_B, EFLAGS),
+ (CMOVAE64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_AE, EFLAGS),
+ (CMOVB64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_E, EFLAGS),
+ (CMOVNE64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_NE, EFLAGS),
+ (CMOVE64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_BE, EFLAGS),
+ (CMOVA64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_A, EFLAGS),
+ (CMOVBE64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_L, EFLAGS),
+ (CMOVGE64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_GE, EFLAGS),
+ (CMOVL64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_LE, EFLAGS),
+ (CMOVG64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_G, EFLAGS),
+ (CMOVLE64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_P, EFLAGS),
+ (CMOVNP64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_NP, EFLAGS),
+ (CMOVP64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_S, EFLAGS),
+ (CMOVNS64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_NS, EFLAGS),
+ (CMOVS64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_O, EFLAGS),
+ (CMOVNO64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_NO, EFLAGS),
+ (CMOVO64rm GR64:$src2, addr:$src1)>;
+
+// zextload bool -> zextload byte
+def : Pat<(zextloadi64i1 addr:$src), (MOVZX64rm8 addr:$src)>;
+
+// extload
+// When extloading from 16-bit and smaller memory locations into 64-bit
+// registers, use zero-extending loads so that the entire 64-bit register is
+// defined, avoiding partial-register updates.
+def : Pat<(extloadi64i1 addr:$src), (MOVZX64rm8 addr:$src)>;
+def : Pat<(extloadi64i8 addr:$src), (MOVZX64rm8 addr:$src)>;
+def : Pat<(extloadi64i16 addr:$src), (MOVZX64rm16 addr:$src)>;
+// For other extloads, use subregs, since the high contents of the register are
+// defined after an extload.
+def : Pat<(extloadi64i32 addr:$src),
+ (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src),
+ sub_32bit)>;
+
+// anyext. Define these to do an explicit zero-extend to
+// avoid partial-register updates.
+def : Pat<(i64 (anyext GR8 :$src)), (MOVZX64rr8 GR8 :$src)>;
+def : Pat<(i64 (anyext GR16:$src)), (MOVZX64rr16 GR16 :$src)>;
+def : Pat<(i64 (anyext GR32:$src)),
+ (SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>;
+
+//===----------------------------------------------------------------------===//
+// Some peepholes
+//===----------------------------------------------------------------------===//
+
+// Odd encoding trick: -128 fits into an 8-bit immediate field while
+// +128 doesn't, so in this special case use a sub instead of an add.
+def : Pat<(add GR64:$src1, 128),
+ (SUB64ri8 GR64:$src1, -128)>;
+def : Pat<(store (add (loadi64 addr:$dst), 128), addr:$dst),
+ (SUB64mi8 addr:$dst, -128)>;
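+
+// For example, "addq $128, %rax" needs a 4-byte immediate, while the
+// equivalent "subq $-128, %rax" encodes its immediate in a single byte.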
+
+// The same trick applies for 32-bit immediate fields in 64-bit
+// instructions.
+def : Pat<(add GR64:$src1, 0x0000000080000000),
+ (SUB64ri32 GR64:$src1, 0xffffffff80000000)>;
+def : Pat<(store (add (loadi64 addr:$dst), 0x0000000080000000), addr:$dst),
+ (SUB64mi32 addr:$dst, 0xffffffff80000000)>;
+
+// Use a 32-bit and with implicit zero-extension instead of a 64-bit and if it
+// has an immediate with at least 32 bits of leading zeros, to avoid needing to
+// materialize that immediate in a register first.
+def : Pat<(and GR64:$src, i64immZExt32:$imm),
+ (SUBREG_TO_REG
+ (i64 0),
+ (AND32ri
+ (EXTRACT_SUBREG GR64:$src, sub_32bit),
+ (i32 (GetLo32XForm imm:$imm))),
+ sub_32bit)>;
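+
+// For example, "x & 0x00000000f0f0f0f0" can be emitted as
+// "andl $0xf0f0f0f0, %eax": the 32-bit operation implicitly zeroes the upper
+// half of the 64-bit register, whereas the 64-bit form could not encode this
+// immediate (it does not sign-extend from 32 bits) and would have to
+// materialize it in a register first.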
+
+// r & (2^32-1) ==> movz
+def : Pat<(and GR64:$src, 0x00000000FFFFFFFF),
+ (MOVZX64rr32 (EXTRACT_SUBREG GR64:$src, sub_32bit))>;
+// r & (2^16-1) ==> movz
+def : Pat<(and GR64:$src, 0xffff),
+ (MOVZX64rr16 (i16 (EXTRACT_SUBREG GR64:$src, sub_16bit)))>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR64:$src, 0xff),
+ (MOVZX64rr8 (i8 (EXTRACT_SUBREG GR64:$src, sub_8bit)))>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR32:$src1, 0xff),
+ (MOVZX32rr8 (EXTRACT_SUBREG GR32:$src1, sub_8bit))>,
+ Requires<[In64BitMode]>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR16:$src1, 0xff),
+ (MOVZX16rr8 (i8 (EXTRACT_SUBREG GR16:$src1, sub_8bit)))>,
+ Requires<[In64BitMode]>;
+
+// sext_inreg patterns
+def : Pat<(sext_inreg GR64:$src, i32),
+ (MOVSX64rr32 (EXTRACT_SUBREG GR64:$src, sub_32bit))>;
+def : Pat<(sext_inreg GR64:$src, i16),
+ (MOVSX64rr16 (EXTRACT_SUBREG GR64:$src, sub_16bit))>;
+def : Pat<(sext_inreg GR64:$src, i8),
+ (MOVSX64rr8 (EXTRACT_SUBREG GR64:$src, sub_8bit))>;
+def : Pat<(sext_inreg GR32:$src, i8),
+ (MOVSX32rr8 (EXTRACT_SUBREG GR32:$src, sub_8bit))>,
+ Requires<[In64BitMode]>;
+def : Pat<(sext_inreg GR16:$src, i8),
+ (MOVSX16rr8 (i8 (EXTRACT_SUBREG GR16:$src, sub_8bit)))>,
+ Requires<[In64BitMode]>;
+
+// trunc patterns
+def : Pat<(i32 (trunc GR64:$src)),
+ (EXTRACT_SUBREG GR64:$src, sub_32bit)>;
+def : Pat<(i16 (trunc GR64:$src)),
+ (EXTRACT_SUBREG GR64:$src, sub_16bit)>;
+def : Pat<(i8 (trunc GR64:$src)),
+ (EXTRACT_SUBREG GR64:$src, sub_8bit)>;
+def : Pat<(i8 (trunc GR32:$src)),
+ (EXTRACT_SUBREG GR32:$src, sub_8bit)>,
+ Requires<[In64BitMode]>;
+def : Pat<(i8 (trunc GR16:$src)),
+ (EXTRACT_SUBREG GR16:$src, sub_8bit)>,
+ Requires<[In64BitMode]>;
+
+// h-register tricks.
+// For now, be conservative on x86-64 and use an h-register extract only if the
+// value is immediately zero-extended or stored, which are somewhat common
+// cases. This uses a bunch of code to prevent a register requiring a REX prefix
+// from being allocated in the same instruction as the h register, as there's
+// currently no way to describe this requirement to the register allocator.
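+// For example, bits 15:8 of a value in %eax can be read with
+// "movzbl %ah, %ecx", but %ah (like %bh, %ch and %dh) cannot be encoded in any
+// instruction that carries a REX prefix, which is why the patterns below copy
+// the value into the ABCD register classes first.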
+
+// h-register extract and zero-extend.
+def : Pat<(and (srl_su GR64:$src, (i8 8)), (i64 255)),
+ (SUBREG_TO_REG
+ (i64 0),
+ (MOVZX32_NOREXrr8
+ (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS GR64:$src, GR64_ABCD)),
+ sub_8bit_hi)),
+ sub_32bit)>;
+def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)),
+ (MOVZX32_NOREXrr8
+ (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
+ sub_8bit_hi))>,
+ Requires<[In64BitMode]>;
+def : Pat<(srl (and_su GR32:$src, 0xff00), (i8 8)),
+ (MOVZX32_NOREXrr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src,
+ GR32_ABCD)),
+ sub_8bit_hi))>,
+ Requires<[In64BitMode]>;
+def : Pat<(srl GR16:$src, (i8 8)),
+ (EXTRACT_SUBREG
+ (MOVZX32_NOREXrr8
+ (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+ sub_8bit_hi)),
+ sub_16bit)>,
+ Requires<[In64BitMode]>;
+def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))),
+ (MOVZX32_NOREXrr8
+ (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+ sub_8bit_hi))>,
+ Requires<[In64BitMode]>;
+def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))),
+ (MOVZX32_NOREXrr8
+ (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+ sub_8bit_hi))>,
+ Requires<[In64BitMode]>;
+def : Pat<(i64 (zext (srl_su GR16:$src, (i8 8)))),
+ (SUBREG_TO_REG
+ (i64 0),
+ (MOVZX32_NOREXrr8
+ (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+ sub_8bit_hi)),
+ sub_32bit)>;
+def : Pat<(i64 (anyext (srl_su GR16:$src, (i8 8)))),
+ (SUBREG_TO_REG
+ (i64 0),
+ (MOVZX32_NOREXrr8
+ (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+ sub_8bit_hi)),
+ sub_32bit)>;
+
+// h-register extract and store.
+def : Pat<(store (i8 (trunc_su (srl_su GR64:$src, (i8 8)))), addr:$dst),
+ (MOV8mr_NOREX
+ addr:$dst,
+ (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS GR64:$src, GR64_ABCD)),
+ sub_8bit_hi))>;
+def : Pat<(store (i8 (trunc_su (srl_su GR32:$src, (i8 8)))), addr:$dst),
+ (MOV8mr_NOREX
+ addr:$dst,
+ (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
+ sub_8bit_hi))>,
+ Requires<[In64BitMode]>;
+def : Pat<(store (i8 (trunc_su (srl_su GR16:$src, (i8 8)))), addr:$dst),
+ (MOV8mr_NOREX
+ addr:$dst,
+ (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+ sub_8bit_hi))>,
+ Requires<[In64BitMode]>;
+
+// (shl x, 1) ==> (add x, x)
+def : Pat<(shl GR64:$src1, (i8 1)), (ADD64rr GR64:$src1, GR64:$src1)>;
+
+// (shl x (and y, 63)) ==> (shl x, y)
+def : Pat<(shl GR64:$src1, (and CL, 63)),
+ (SHL64rCL GR64:$src1)>;
+def : Pat<(store (shl (loadi64 addr:$dst), (and CL, 63)), addr:$dst),
+ (SHL64mCL addr:$dst)>;
+
+def : Pat<(srl GR64:$src1, (and CL, 63)),
+ (SHR64rCL GR64:$src1)>;
+def : Pat<(store (srl (loadi64 addr:$dst), (and CL, 63)), addr:$dst),
+ (SHR64mCL addr:$dst)>;
+
+def : Pat<(sra GR64:$src1, (and CL, 63)),
+ (SAR64rCL GR64:$src1)>;
+def : Pat<(store (sra (loadi64 addr:$dst), (and CL, 63)), addr:$dst),
+ (SAR64mCL addr:$dst)>;
+
+// (or x1, x2) -> (add x1, x2) if two operands are known not to share bits.
+let AddedComplexity = 5 in { // Try this before the selecting to OR
+def : Pat<(or_is_add GR64:$src1, i64immSExt8:$src2),
+ (ADD64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(or_is_add GR64:$src1, i64immSExt32:$src2),
+ (ADD64ri32 GR64:$src1, i64immSExt32:$src2)>;
+def : Pat<(or_is_add GR64:$src1, GR64:$src2),
+ (ADD64rr GR64:$src1, GR64:$src2)>;
+} // AddedComplexity
+
+// X86 specific add which produces a flag.
+def : Pat<(addc GR64:$src1, GR64:$src2),
+ (ADD64rr GR64:$src1, GR64:$src2)>;
+def : Pat<(addc GR64:$src1, (load addr:$src2)),
+ (ADD64rm GR64:$src1, addr:$src2)>;
+def : Pat<(addc GR64:$src1, i64immSExt8:$src2),
+ (ADD64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(addc GR64:$src1, i64immSExt32:$src2),
+ (ADD64ri32 GR64:$src1, imm:$src2)>;
+
+def : Pat<(subc GR64:$src1, GR64:$src2),
+ (SUB64rr GR64:$src1, GR64:$src2)>;
+def : Pat<(subc GR64:$src1, (load addr:$src2)),
+ (SUB64rm GR64:$src1, addr:$src2)>;
+def : Pat<(subc GR64:$src1, i64immSExt8:$src2),
+ (SUB64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(subc GR64:$src1, i64immSExt32:$src2),
+          (SUB64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+//===----------------------------------------------------------------------===//
+// EFLAGS-defining Patterns
+//===----------------------------------------------------------------------===//
+
+// addition
+def : Pat<(add GR64:$src1, GR64:$src2),
+ (ADD64rr GR64:$src1, GR64:$src2)>;
+def : Pat<(add GR64:$src1, i64immSExt8:$src2),
+ (ADD64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(add GR64:$src1, i64immSExt32:$src2),
+ (ADD64ri32 GR64:$src1, i64immSExt32:$src2)>;
+def : Pat<(add GR64:$src1, (loadi64 addr:$src2)),
+ (ADD64rm GR64:$src1, addr:$src2)>;
+
+// subtraction
+def : Pat<(sub GR64:$src1, GR64:$src2),
+ (SUB64rr GR64:$src1, GR64:$src2)>;
+def : Pat<(sub GR64:$src1, (loadi64 addr:$src2)),
+ (SUB64rm GR64:$src1, addr:$src2)>;
+def : Pat<(sub GR64:$src1, i64immSExt8:$src2),
+ (SUB64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(sub GR64:$src1, i64immSExt32:$src2),
+ (SUB64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+// Multiply
+def : Pat<(mul GR64:$src1, GR64:$src2),
+ (IMUL64rr GR64:$src1, GR64:$src2)>;
+def : Pat<(mul GR64:$src1, (loadi64 addr:$src2)),
+ (IMUL64rm GR64:$src1, addr:$src2)>;
+def : Pat<(mul GR64:$src1, i64immSExt8:$src2),
+ (IMUL64rri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(mul GR64:$src1, i64immSExt32:$src2),
+ (IMUL64rri32 GR64:$src1, i64immSExt32:$src2)>;
+def : Pat<(mul (loadi64 addr:$src1), i64immSExt8:$src2),
+ (IMUL64rmi8 addr:$src1, i64immSExt8:$src2)>;
+def : Pat<(mul (loadi64 addr:$src1), i64immSExt32:$src2),
+ (IMUL64rmi32 addr:$src1, i64immSExt32:$src2)>;
+
+// inc/dec
+def : Pat<(add GR16:$src, 1), (INC64_16r GR16:$src)>, Requires<[In64BitMode]>;
+def : Pat<(add GR16:$src, -1), (DEC64_16r GR16:$src)>, Requires<[In64BitMode]>;
+def : Pat<(add GR32:$src, 1), (INC64_32r GR32:$src)>, Requires<[In64BitMode]>;
+def : Pat<(add GR32:$src, -1), (DEC64_32r GR32:$src)>, Requires<[In64BitMode]>;
+def : Pat<(add GR64:$src, 1), (INC64r GR64:$src)>;
+def : Pat<(add GR64:$src, -1), (DEC64r GR64:$src)>;
+
+// or
+def : Pat<(or GR64:$src1, GR64:$src2),
+ (OR64rr GR64:$src1, GR64:$src2)>;
+def : Pat<(or GR64:$src1, i64immSExt8:$src2),
+ (OR64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(or GR64:$src1, i64immSExt32:$src2),
+ (OR64ri32 GR64:$src1, i64immSExt32:$src2)>;
+def : Pat<(or GR64:$src1, (loadi64 addr:$src2)),
+ (OR64rm GR64:$src1, addr:$src2)>;
+
+// xor
+def : Pat<(xor GR64:$src1, GR64:$src2),
+ (XOR64rr GR64:$src1, GR64:$src2)>;
+def : Pat<(xor GR64:$src1, i64immSExt8:$src2),
+ (XOR64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(xor GR64:$src1, i64immSExt32:$src2),
+ (XOR64ri32 GR64:$src1, i64immSExt32:$src2)>;
+def : Pat<(xor GR64:$src1, (loadi64 addr:$src2)),
+ (XOR64rm GR64:$src1, addr:$src2)>;
+
+// and
+def : Pat<(and GR64:$src1, GR64:$src2),
+ (AND64rr GR64:$src1, GR64:$src2)>;
+def : Pat<(and GR64:$src1, i64immSExt8:$src2),
+ (AND64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(and GR64:$src1, i64immSExt32:$src2),
+ (AND64ri32 GR64:$src1, i64immSExt32:$src2)>;
+def : Pat<(and GR64:$src1, (loadi64 addr:$src2)),
+ (AND64rm GR64:$src1, addr:$src2)>;
+
+//===----------------------------------------------------------------------===//
+// X86-64 SSE Instructions
+//===----------------------------------------------------------------------===//
+
+// Move instructions...
+
+def MOV64toPQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+ "mov{d|q}\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2i64 (scalar_to_vector GR64:$src)))]>;
+def MOVPQIto64rr : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
+ "mov{d|q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
+ (iPTR 0)))]>;
+
+def MOV64toSDrr : RPDI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
+ "mov{d|q}\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (bitconvert GR64:$src))]>;
+def MOV64toSDrm : S3SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>;
+
+def MOVSDto64rr : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
+ "mov{d|q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (bitconvert FR64:$src))]>;
+def MOVSDto64mr : RPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>;
+
+//===----------------------------------------------------------------------===//
+// X86-64 SSE4.1 Instructions
+//===----------------------------------------------------------------------===//
+
+/// SS41I_extract64 - SSE 4.1 extract 64 bits to int reg or memory destination
+multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
+ def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst),
+ (ins VR128:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set GR64:$dst,
+ (extractelt (v2i64 VR128:$src1), imm:$src2))]>, OpSize, REX_W;
+ def mr : SS4AIi8<opc, MRMDestMem, (outs),
+ (ins i64mem:$dst, VR128:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(store (extractelt (v2i64 VR128:$src1), imm:$src2),
+ addr:$dst)]>, OpSize, REX_W;
+}
+
+defm PEXTRQ : SS41I_extract64<0x16, "pextrq">;
+
+let isTwoAddress = 1 in {
+ multiclass SS41I_insert64<bits<8> opc, string OpcodeStr> {
+ def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, GR64:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst,
+ (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
+ OpSize, REX_W;
+ def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i64mem:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst,
+ (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2),
+ imm:$src3)))]>, OpSize, REX_W;
+ }
+}
+
+defm PINSRQ : SS41I_insert64<0x22, "pinsrq">;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrBuilder.h b/contrib/llvm/lib/Target/X86/X86InstrBuilder.h
new file mode 100644
index 0000000..5a82a7b
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrBuilder.h
@@ -0,0 +1,172 @@
+//===-- X86InstrBuilder.h - Functions to aid building x86 insts -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file exposes functions that may be used with BuildMI from the
+// MachineInstrBuilder.h file to handle X86'isms in a clean way.
+//
+// The BuildMem function may be used with the BuildMI function to add entire
+// memory references in a single, typed, function call. X86 memory references
+// can be very complex expressions (described in the README), so wrapping them
+// up behind an easier to use interface makes sense. Descriptions of the
+// functions are included below.
+//
+// For reference, the order of operands for memory references is:
+// (Operand), Base, Scale, Index, Displacement.
+//
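+// As a usage sketch (the insertion point, opcode and registers below are
+// placeholders chosen purely for illustration):
+//
+//   // Load a 32-bit value from stack slot FI into virtual register DestReg.
+//   addFrameReference(BuildMI(MBB, MI, DL, TII.get(X86::MOV32rm), DestReg), FI);
+//
+//   // Store EAX through the pointer held in EBX: "movl %eax, (%ebx)".
+//   addDirectMem(BuildMI(MBB, MI, DL, TII.get(X86::MOV32mr)), X86::EBX)
+//     .addReg(X86::EAX);
+//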
+//===----------------------------------------------------------------------===//
+
+#ifndef X86INSTRBUILDER_H
+#define X86INSTRBUILDER_H
+
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+
+namespace llvm {
+
+/// X86AddressMode - This struct holds a generalized full x86 address mode.
+/// The base register can be a frame index, which will eventually be replaced
+/// with BP or SP, with Disp adjusted accordingly. The displacement may
+/// also include the offset of a global value.
+struct X86AddressMode {
+ enum {
+ RegBase,
+ FrameIndexBase
+ } BaseType;
+
+ union {
+ unsigned Reg;
+ int FrameIndex;
+ } Base;
+
+ unsigned Scale;
+ unsigned IndexReg;
+ int Disp;
+ const GlobalValue *GV;
+ unsigned GVOpFlags;
+
+ X86AddressMode()
+ : BaseType(RegBase), Scale(1), IndexReg(0), Disp(0), GV(0), GVOpFlags(0) {
+ Base.Reg = 0;
+ }
+};
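+
+/// For illustration, the x86 address [EBP + 4*ECX + 8] would be described as
+/// follows (a sketch; the registers are arbitrary):
+///
+///   X86AddressMode AM;
+///   AM.Base.Reg = X86::EBP;   // BaseType is RegBase by default
+///   AM.Scale    = 4;
+///   AM.IndexReg = X86::ECX;
+///   AM.Disp     = 8;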
+
+/// addDirectMem - This function is used to add a direct memory reference to the
+/// current instruction -- that is, a dereference of an address in a register,
+/// with no scale, index or displacement. An example is: DWORD PTR [EAX].
+///
+static inline const MachineInstrBuilder &
+addDirectMem(const MachineInstrBuilder &MIB, unsigned Reg) {
+ // Because memory references are always represented with four
+ // values, this adds: Reg, [1, NoReg, 0] to the instruction.
+ return MIB.addReg(Reg).addImm(1).addReg(0).addImm(0);
+}
+
+static inline const MachineInstrBuilder &
+addLeaOffset(const MachineInstrBuilder &MIB, int Offset) {
+ return MIB.addImm(1).addReg(0).addImm(Offset);
+}
+
+static inline const MachineInstrBuilder &
+addOffset(const MachineInstrBuilder &MIB, int Offset) {
+ return addLeaOffset(MIB, Offset).addReg(0);
+}
+
+/// addRegOffset - This function is used to add a memory reference of the form
+/// [Reg + Offset], i.e., one with no scale or index, but with a
+/// displacement. An example is: DWORD PTR [EAX + 4].
+///
+static inline const MachineInstrBuilder &
+addRegOffset(const MachineInstrBuilder &MIB,
+ unsigned Reg, bool isKill, int Offset) {
+ return addOffset(MIB.addReg(Reg, getKillRegState(isKill)), Offset);
+}
+
+static inline const MachineInstrBuilder &
+addLeaRegOffset(const MachineInstrBuilder &MIB,
+ unsigned Reg, bool isKill, int Offset) {
+ return addLeaOffset(MIB.addReg(Reg, getKillRegState(isKill)), Offset);
+}
+
+/// addRegReg - This function is used to add a memory reference of the form:
+/// [Reg + Reg].
+static inline const MachineInstrBuilder &addRegReg(const MachineInstrBuilder &MIB,
+ unsigned Reg1, bool isKill1,
+ unsigned Reg2, bool isKill2) {
+ return MIB.addReg(Reg1, getKillRegState(isKill1)).addImm(1)
+ .addReg(Reg2, getKillRegState(isKill2)).addImm(0);
+}
+
+static inline const MachineInstrBuilder &
+addLeaAddress(const MachineInstrBuilder &MIB, const X86AddressMode &AM) {
+ assert (AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8);
+
+ if (AM.BaseType == X86AddressMode::RegBase)
+ MIB.addReg(AM.Base.Reg);
+ else if (AM.BaseType == X86AddressMode::FrameIndexBase)
+ MIB.addFrameIndex(AM.Base.FrameIndex);
+ else
+ assert (0);
+ MIB.addImm(AM.Scale).addReg(AM.IndexReg);
+ if (AM.GV)
+ return MIB.addGlobalAddress(AM.GV, AM.Disp, AM.GVOpFlags);
+ else
+ return MIB.addImm(AM.Disp);
+}
+
+static inline const MachineInstrBuilder &
+addFullAddress(const MachineInstrBuilder &MIB,
+ const X86AddressMode &AM) {
+ return addLeaAddress(MIB, AM).addReg(0);
+}
+
+/// addFrameReference - This function is used to add a reference to the base of
+/// an abstract object on the stack frame of the current function. This
+/// reference uses the FrameIndex as its base register until it is resolved.
+/// This allows a constant offset to be specified as well...
+///
+static inline const MachineInstrBuilder &
+addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0) {
+ MachineInstr *MI = MIB;
+ MachineFunction &MF = *MI->getParent()->getParent();
+ MachineFrameInfo &MFI = *MF.getFrameInfo();
+ const TargetInstrDesc &TID = MI->getDesc();
+ unsigned Flags = 0;
+ if (TID.mayLoad())
+ Flags |= MachineMemOperand::MOLoad;
+ if (TID.mayStore())
+ Flags |= MachineMemOperand::MOStore;
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(PseudoSourceValue::getFixedStack(FI),
+ Flags, Offset,
+ MFI.getObjectSize(FI),
+ MFI.getObjectAlignment(FI));
+ return addOffset(MIB.addFrameIndex(FI), Offset)
+ .addMemOperand(MMO);
+}
+
+/// addConstantPoolReference - This function is used to add a reference to the
+/// base of a constant value spilled to the per-function constant pool. The
+/// reference uses the abstract ConstantPoolIndex which is retained until
+/// either machine code emission or assembly output. In PIC mode on x86-32,
+/// the GlobalBaseReg parameter can be used to make this a
+/// GlobalBaseReg-relative reference.
+///
+static inline const MachineInstrBuilder &
+addConstantPoolReference(const MachineInstrBuilder &MIB, unsigned CPI,
+ unsigned GlobalBaseReg, unsigned char OpFlags) {
+ //FIXME: factor this
+ return MIB.addReg(GlobalBaseReg).addImm(1).addReg(0)
+ .addConstantPoolIndex(CPI, 0, OpFlags).addReg(0);
+}
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFPStack.td b/contrib/llvm/lib/Target/X86/X86InstrFPStack.td
new file mode 100644
index 0000000..0aae4a8
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrFPStack.td
@@ -0,0 +1,698 @@
+//==- X86InstrFPStack.td - Describe the X86 Instruction Set --*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 x87 FPU instruction set, defining the
+// instructions, and properties of the instructions which are needed for code
+// generation, machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// FPStack specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+def SDTX86FpGet2 : SDTypeProfile<2, 0, [SDTCisVT<0, f80>,
+ SDTCisVT<1, f80>]>;
+def SDTX86Fld : SDTypeProfile<1, 2, [SDTCisFP<0>,
+ SDTCisPtrTy<1>,
+ SDTCisVT<2, OtherVT>]>;
+def SDTX86Fst : SDTypeProfile<0, 3, [SDTCisFP<0>,
+ SDTCisPtrTy<1>,
+ SDTCisVT<2, OtherVT>]>;
+def SDTX86Fild : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisPtrTy<1>,
+ SDTCisVT<2, OtherVT>]>;
+def SDTX86FpToIMem : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisPtrTy<1>]>;
+
+def SDTX86CwdStore : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+
+def X86fld : SDNode<"X86ISD::FLD", SDTX86Fld,
+ [SDNPHasChain, SDNPMayLoad]>;
+def X86fst : SDNode<"X86ISD::FST", SDTX86Fst,
+ [SDNPHasChain, SDNPInFlag, SDNPMayStore]>;
+def X86fild : SDNode<"X86ISD::FILD", SDTX86Fild,
+ [SDNPHasChain, SDNPMayLoad]>;
+def X86fildflag : SDNode<"X86ISD::FILD_FLAG", SDTX86Fild,
+ [SDNPHasChain, SDNPOutFlag, SDNPMayLoad]>;
+def X86fp_to_i16mem : SDNode<"X86ISD::FP_TO_INT16_IN_MEM", SDTX86FpToIMem,
+ [SDNPHasChain, SDNPMayStore]>;
+def X86fp_to_i32mem : SDNode<"X86ISD::FP_TO_INT32_IN_MEM", SDTX86FpToIMem,
+ [SDNPHasChain, SDNPMayStore]>;
+def X86fp_to_i64mem : SDNode<"X86ISD::FP_TO_INT64_IN_MEM", SDTX86FpToIMem,
+ [SDNPHasChain, SDNPMayStore]>;
+def X86fp_cwd_get16 : SDNode<"X86ISD::FNSTCW16m", SDTX86CwdStore,
+ [SDNPHasChain, SDNPMayStore, SDNPSideEffect]>;
+
+//===----------------------------------------------------------------------===//
+// FPStack pattern fragments
+//===----------------------------------------------------------------------===//
+
+def fpimm0 : PatLeaf<(fpimm), [{
+ return N->isExactlyValue(+0.0);
+}]>;
+
+def fpimmneg0 : PatLeaf<(fpimm), [{
+ return N->isExactlyValue(-0.0);
+}]>;
+
+def fpimm1 : PatLeaf<(fpimm), [{
+ return N->isExactlyValue(+1.0);
+}]>;
+
+def fpimmneg1 : PatLeaf<(fpimm), [{
+ return N->isExactlyValue(-1.0);
+}]>;
+
+// Some 'special' instructions
+let usesCustomInserter = 1 in { // Expanded after instruction selection.
+ def FP32_TO_INT16_IN_MEM : I<0, Pseudo,
+ (outs), (ins i16mem:$dst, RFP32:$src),
+ "##FP32_TO_INT16_IN_MEM PSEUDO!",
+ [(X86fp_to_i16mem RFP32:$src, addr:$dst)]>;
+ def FP32_TO_INT32_IN_MEM : I<0, Pseudo,
+ (outs), (ins i32mem:$dst, RFP32:$src),
+ "##FP32_TO_INT32_IN_MEM PSEUDO!",
+ [(X86fp_to_i32mem RFP32:$src, addr:$dst)]>;
+ def FP32_TO_INT64_IN_MEM : I<0, Pseudo,
+ (outs), (ins i64mem:$dst, RFP32:$src),
+ "##FP32_TO_INT64_IN_MEM PSEUDO!",
+ [(X86fp_to_i64mem RFP32:$src, addr:$dst)]>;
+ def FP64_TO_INT16_IN_MEM : I<0, Pseudo,
+ (outs), (ins i16mem:$dst, RFP64:$src),
+ "##FP64_TO_INT16_IN_MEM PSEUDO!",
+ [(X86fp_to_i16mem RFP64:$src, addr:$dst)]>;
+ def FP64_TO_INT32_IN_MEM : I<0, Pseudo,
+ (outs), (ins i32mem:$dst, RFP64:$src),
+ "##FP64_TO_INT32_IN_MEM PSEUDO!",
+ [(X86fp_to_i32mem RFP64:$src, addr:$dst)]>;
+ def FP64_TO_INT64_IN_MEM : I<0, Pseudo,
+ (outs), (ins i64mem:$dst, RFP64:$src),
+ "##FP64_TO_INT64_IN_MEM PSEUDO!",
+ [(X86fp_to_i64mem RFP64:$src, addr:$dst)]>;
+ def FP80_TO_INT16_IN_MEM : I<0, Pseudo,
+ (outs), (ins i16mem:$dst, RFP80:$src),
+ "##FP80_TO_INT16_IN_MEM PSEUDO!",
+ [(X86fp_to_i16mem RFP80:$src, addr:$dst)]>;
+ def FP80_TO_INT32_IN_MEM : I<0, Pseudo,
+ (outs), (ins i32mem:$dst, RFP80:$src),
+ "##FP80_TO_INT32_IN_MEM PSEUDO!",
+ [(X86fp_to_i32mem RFP80:$src, addr:$dst)]>;
+ def FP80_TO_INT64_IN_MEM : I<0, Pseudo,
+ (outs), (ins i64mem:$dst, RFP80:$src),
+ "##FP80_TO_INT64_IN_MEM PSEUDO!",
+ [(X86fp_to_i64mem RFP80:$src, addr:$dst)]>;
+}
+
+let isTerminator = 1 in
+ let Defs = [FP0, FP1, FP2, FP3, FP4, FP5, FP6] in
+ def FP_REG_KILL : I<0, Pseudo, (outs), (ins), "##FP_REG_KILL", []>;
+
+// All FP Stack operations are represented with four instructions here. The
+// first three instructions, generated by the instruction selector, use "RFP32",
+// "RFP64" or "RFP80" registers: traditional register files to reference 32-bit,
+// 64-bit or 80-bit floating point values. These sizes apply to the values,
+// not the registers, which are always 80 bits; RFP32, RFP64 and RFP80 can be
+// copied to each other without losing information. These instructions are all
+// pseudo instructions and use the "_Fp" suffix.
+// In some cases there are additional variants with a mixture of different
+// register sizes.
+// The fourth instruction is defined with FPI, which is the actual instruction
+// emitted by the assembler. These use "RST" registers, although frequently
+// the actual register(s) used are implicit. These are always 80 bits.
+// The FP stackifier pass converts one to the other after register allocation
+// occurs.
+//
+// Note that the FpI instruction should have instruction selection info (e.g.
+// a pattern) and the FPI instruction should have emission info (e.g. opcode
+// encoding and asm printing info).
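+//
+// For example (using definitions that appear further below), an f64 fadd
+// selected from a pattern uses the ADD_Fp64 pseudo; after register allocation
+// the FP stackifier rewrites it to one of the real FPI forms such as
+// ADD_FST0r or ADD_FPrST0.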
+
+// Pseudo Instructions for FP stack return values.
+def FpGET_ST0_32 : FpI_<(outs RFP32:$dst), (ins), SpecialFP, []>; // FPR = ST(0)
+def FpGET_ST0_64 : FpI_<(outs RFP64:$dst), (ins), SpecialFP, []>; // FPR = ST(0)
+def FpGET_ST0_80 : FpI_<(outs RFP80:$dst), (ins), SpecialFP, []>; // FPR = ST(0)
+
+// FpGET_ST1* should only be issued *after* an FpGET_ST0* has been issued when
+// there are two values live out on the stack from a call or inlineasm. This
+// magic is handled by the stackifier. It is not valid to emit FpGET_ST1* and
+// then FpGET_ST0*. In addition, it is invalid for any FP-using operations to
+// occur between them.
+def FpGET_ST1_32 : FpI_<(outs RFP32:$dst), (ins), SpecialFP, []>; // FPR = ST(1)
+def FpGET_ST1_64 : FpI_<(outs RFP64:$dst), (ins), SpecialFP, []>; // FPR = ST(1)
+def FpGET_ST1_80 : FpI_<(outs RFP80:$dst), (ins), SpecialFP, []>; // FPR = ST(1)
+
+let Defs = [ST0] in {
+def FpSET_ST0_32 : FpI_<(outs), (ins RFP32:$src), SpecialFP, []>; // ST(0) = FPR
+def FpSET_ST0_64 : FpI_<(outs), (ins RFP64:$src), SpecialFP, []>; // ST(0) = FPR
+def FpSET_ST0_80 : FpI_<(outs), (ins RFP80:$src), SpecialFP, []>; // ST(0) = FPR
+}
+
+let Defs = [ST1] in {
+def FpSET_ST1_32 : FpI_<(outs), (ins RFP32:$src), SpecialFP, []>; // ST(1) = FPR
+def FpSET_ST1_64 : FpI_<(outs), (ins RFP64:$src), SpecialFP, []>; // ST(1) = FPR
+def FpSET_ST1_80 : FpI_<(outs), (ins RFP80:$src), SpecialFP, []>; // ST(1) = FPR
+}
+
+// FpIf32, FpIf64 - Floating Point Pseudo Instruction template.
+// f32 instructions can use SSE1 and are predicated on FPStackf32 == !SSE1.
+// f64 instructions can use SSE2 and are predicated on FPStackf64 == !SSE2.
+// f80 instructions cannot use SSE and use neither of these.
+class FpIf32<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+ FpI_<outs, ins, fp, pattern>, Requires<[FPStackf32]>;
+class FpIf64<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+ FpI_<outs, ins, fp, pattern>, Requires<[FPStackf64]>;
+
+// Register copies. These are just copies; the shortening ones do not truncate.
+let neverHasSideEffects = 1 in {
+ def MOV_Fp3232 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src), SpecialFP, []>;
+ def MOV_Fp3264 : FpIf32<(outs RFP64:$dst), (ins RFP32:$src), SpecialFP, []>;
+ def MOV_Fp6432 : FpIf32<(outs RFP32:$dst), (ins RFP64:$src), SpecialFP, []>;
+ def MOV_Fp6464 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src), SpecialFP, []>;
+ def MOV_Fp8032 : FpIf32<(outs RFP32:$dst), (ins RFP80:$src), SpecialFP, []>;
+ def MOV_Fp3280 : FpIf32<(outs RFP80:$dst), (ins RFP32:$src), SpecialFP, []>;
+ def MOV_Fp8064 : FpIf64<(outs RFP64:$dst), (ins RFP80:$src), SpecialFP, []>;
+ def MOV_Fp6480 : FpIf64<(outs RFP80:$dst), (ins RFP64:$src), SpecialFP, []>;
+ def MOV_Fp8080 : FpI_ <(outs RFP80:$dst), (ins RFP80:$src), SpecialFP, []>;
+}
+
+// Factoring for arithmetic.
+multiclass FPBinary_rr<SDNode OpNode> {
+// Register op register -> register
+// These are separated out because they have no reversed form.
+def _Fp32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2), TwoArgFP,
+ [(set RFP32:$dst, (OpNode RFP32:$src1, RFP32:$src2))]>;
+def _Fp64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, RFP64:$src2), TwoArgFP,
+ [(set RFP64:$dst, (OpNode RFP64:$src1, RFP64:$src2))]>;
+def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2), TwoArgFP,
+ [(set RFP80:$dst, (OpNode RFP80:$src1, RFP80:$src2))]>;
+}
+// The FopST0 series are not included here because of the irregularities
+// in where the 'r' goes in assembly output.
+// These instructions cannot address 80-bit memory.
+multiclass FPBinary<SDNode OpNode, Format fp, string asmstring> {
+// ST(0) = ST(0) + [mem]
+def _Fp32m : FpIf32<(outs RFP32:$dst),
+ (ins RFP32:$src1, f32mem:$src2), OneArgFPRW,
+ [(set RFP32:$dst,
+ (OpNode RFP32:$src1, (loadf32 addr:$src2)))]>;
+def _Fp64m : FpIf64<(outs RFP64:$dst),
+ (ins RFP64:$src1, f64mem:$src2), OneArgFPRW,
+ [(set RFP64:$dst,
+ (OpNode RFP64:$src1, (loadf64 addr:$src2)))]>;
+def _Fp64m32: FpIf64<(outs RFP64:$dst),
+ (ins RFP64:$src1, f32mem:$src2), OneArgFPRW,
+ [(set RFP64:$dst,
+ (OpNode RFP64:$src1, (f64 (extloadf32 addr:$src2))))]>;
+def _Fp80m32: FpI_<(outs RFP80:$dst),
+ (ins RFP80:$src1, f32mem:$src2), OneArgFPRW,
+ [(set RFP80:$dst,
+ (OpNode RFP80:$src1, (f80 (extloadf32 addr:$src2))))]>;
+def _Fp80m64: FpI_<(outs RFP80:$dst),
+ (ins RFP80:$src1, f64mem:$src2), OneArgFPRW,
+ [(set RFP80:$dst,
+ (OpNode RFP80:$src1, (f80 (extloadf64 addr:$src2))))]>;
+def _F32m : FPI<0xD8, fp, (outs), (ins f32mem:$src),
+ !strconcat("f", !strconcat(asmstring, "{s}\t$src"))> {
+ let mayLoad = 1;
+}
+def _F64m : FPI<0xDC, fp, (outs), (ins f64mem:$src),
+ !strconcat("f", !strconcat(asmstring, "{l}\t$src"))> {
+ let mayLoad = 1;
+}
+// ST(0) = ST(0) + [memint]
+def _FpI16m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i16mem:$src2),
+ OneArgFPRW,
+ [(set RFP32:$dst, (OpNode RFP32:$src1,
+ (X86fild addr:$src2, i16)))]>;
+def _FpI32m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i32mem:$src2),
+ OneArgFPRW,
+ [(set RFP32:$dst, (OpNode RFP32:$src1,
+ (X86fild addr:$src2, i32)))]>;
+def _FpI16m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i16mem:$src2),
+ OneArgFPRW,
+ [(set RFP64:$dst, (OpNode RFP64:$src1,
+ (X86fild addr:$src2, i16)))]>;
+def _FpI32m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i32mem:$src2),
+ OneArgFPRW,
+ [(set RFP64:$dst, (OpNode RFP64:$src1,
+ (X86fild addr:$src2, i32)))]>;
+def _FpI16m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i16mem:$src2),
+ OneArgFPRW,
+ [(set RFP80:$dst, (OpNode RFP80:$src1,
+ (X86fild addr:$src2, i16)))]>;
+def _FpI32m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i32mem:$src2),
+ OneArgFPRW,
+ [(set RFP80:$dst, (OpNode RFP80:$src1,
+ (X86fild addr:$src2, i32)))]>;
+def _FI16m : FPI<0xDE, fp, (outs), (ins i16mem:$src),
+ !strconcat("fi", !strconcat(asmstring, "{s}\t$src"))> {
+ let mayLoad = 1;
+}
+def _FI32m : FPI<0xDA, fp, (outs), (ins i32mem:$src),
+ !strconcat("fi", !strconcat(asmstring, "{l}\t$src"))> {
+ let mayLoad = 1;
+}
+}
+
+defm ADD : FPBinary_rr<fadd>;
+defm SUB : FPBinary_rr<fsub>;
+defm MUL : FPBinary_rr<fmul>;
+defm DIV : FPBinary_rr<fdiv>;
+defm ADD : FPBinary<fadd, MRM0m, "add">;
+defm SUB : FPBinary<fsub, MRM4m, "sub">;
+defm SUBR: FPBinary<fsub, MRM5m, "subr">;
+defm MUL : FPBinary<fmul, MRM1m, "mul">;
+defm DIV : FPBinary<fdiv, MRM6m, "div">;
+defm DIVR: FPBinary<fdiv, MRM7m, "divr">;
+
+class FPST0rInst<bits<8> o, string asm>
+ : FPI<o, AddRegFrm, (outs), (ins RST:$op), asm>, D8;
+class FPrST0Inst<bits<8> o, string asm>
+ : FPI<o, AddRegFrm, (outs), (ins RST:$op), asm>, DC;
+class FPrST0PInst<bits<8> o, string asm>
+ : FPI<o, AddRegFrm, (outs), (ins RST:$op), asm>, DE;
+
+// NOTE: GAS and apparently all other AT&T style assemblers have a broken notion
+// of some of the 'reverse' forms of the fsub and fdiv instructions. As such,
+// we have to put some 'r's in and take them out of weird places.
+def ADD_FST0r : FPST0rInst <0xC0, "fadd\t$op">;
+def ADD_FrST0 : FPrST0Inst <0xC0, "fadd\t{%st(0), $op|$op, %ST(0)}">;
+def ADD_FPrST0 : FPrST0PInst<0xC0, "faddp\t$op">;
+def SUBR_FST0r : FPST0rInst <0xE8, "fsubr\t$op">;
+def SUB_FrST0 : FPrST0Inst <0xE8, "fsub{r}\t{%st(0), $op|$op, %ST(0)}">;
+def SUB_FPrST0 : FPrST0PInst<0xE8, "fsub{r}p\t$op">;
+def SUB_FST0r : FPST0rInst <0xE0, "fsub\t$op">;
+def SUBR_FrST0 : FPrST0Inst <0xE0, "fsub{|r}\t{%st(0), $op|$op, %ST(0)}">;
+def SUBR_FPrST0 : FPrST0PInst<0xE0, "fsub{|r}p\t$op">;
+def MUL_FST0r : FPST0rInst <0xC8, "fmul\t$op">;
+def MUL_FrST0 : FPrST0Inst <0xC8, "fmul\t{%st(0), $op|$op, %ST(0)}">;
+def MUL_FPrST0 : FPrST0PInst<0xC8, "fmulp\t$op">;
+def DIVR_FST0r : FPST0rInst <0xF8, "fdivr\t$op">;
+def DIV_FrST0 : FPrST0Inst <0xF8, "fdiv{r}\t{%st(0), $op|$op, %ST(0)}">;
+def DIV_FPrST0 : FPrST0PInst<0xF8, "fdiv{r}p\t$op">;
+def DIV_FST0r : FPST0rInst <0xF0, "fdiv\t$op">;
+def DIVR_FrST0 : FPrST0Inst <0xF0, "fdiv{|r}\t{%st(0), $op|$op, %ST(0)}">;
+def DIVR_FPrST0 : FPrST0PInst<0xF0, "fdiv{|r}p\t$op">;
+
+def COM_FST0r : FPST0rInst <0xD0, "fcom\t$op">;
+def COMP_FST0r : FPST0rInst <0xD8, "fcomp\t$op">;
+
+// Unary operations.
+multiclass FPUnary<SDNode OpNode, bits<8> opcode, string asmstring> {
+def _Fp32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src), OneArgFPRW,
+ [(set RFP32:$dst, (OpNode RFP32:$src))]>;
+def _Fp64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src), OneArgFPRW,
+ [(set RFP64:$dst, (OpNode RFP64:$src))]>;
+def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src), OneArgFPRW,
+ [(set RFP80:$dst, (OpNode RFP80:$src))]>;
+def _F : FPI<opcode, RawFrm, (outs), (ins), asmstring>, D9;
+}
+
+defm CHS : FPUnary<fneg, 0xE0, "fchs">;
+defm ABS : FPUnary<fabs, 0xE1, "fabs">;
+defm SQRT: FPUnary<fsqrt,0xFA, "fsqrt">;
+defm SIN : FPUnary<fsin, 0xFE, "fsin">;
+defm COS : FPUnary<fcos, 0xFF, "fcos">;
+
+let neverHasSideEffects = 1 in {
+def TST_Fp32 : FpIf32<(outs), (ins RFP32:$src), OneArgFP, []>;
+def TST_Fp64 : FpIf64<(outs), (ins RFP64:$src), OneArgFP, []>;
+def TST_Fp80 : FpI_<(outs), (ins RFP80:$src), OneArgFP, []>;
+}
+def TST_F : FPI<0xE4, RawFrm, (outs), (ins), "ftst">, D9;
+
+// Versions of FP instructions that take a single memory operand. Added for the
+// disassembler; remove them as they become covered by patterns elsewhere.
+def FCOM32m : FPI<0xD8, MRM2m, (outs), (ins f32mem:$src), "fcom{s}\t$src">;
+def FCOMP32m : FPI<0xD8, MRM3m, (outs), (ins f32mem:$src), "fcomp{s}\t$src">;
+
+def FLDENVm : FPI<0xD9, MRM4m, (outs), (ins f32mem:$src), "fldenv\t$src">;
+def FSTENVm : FPI<0xD9, MRM6m, (outs f32mem:$dst), (ins), "fnstenv\t$dst">;
+
+def FICOM32m : FPI<0xDA, MRM2m, (outs), (ins i32mem:$src), "ficom{l}\t$src">;
+def FICOMP32m: FPI<0xDA, MRM3m, (outs), (ins i32mem:$src), "ficomp{l}\t$src">;
+
+def FCOM64m : FPI<0xDC, MRM2m, (outs), (ins f64mem:$src), "fcom{l}\t$src">;
+def FCOMP64m : FPI<0xDC, MRM3m, (outs), (ins f64mem:$src), "fcomp{l}\t$src">;
+
+def FRSTORm : FPI<0xDD, MRM4m, (outs f32mem:$dst), (ins), "frstor\t$dst">;
+def FSAVEm : FPI<0xDD, MRM6m, (outs f32mem:$dst), (ins), "fnsave\t$dst">;
+def FNSTSWm : FPI<0xDD, MRM7m, (outs f32mem:$dst), (ins), "fnstsw\t$dst">;
+
+def FICOM16m : FPI<0xDE, MRM2m, (outs), (ins i16mem:$src), "ficom{s}\t$src">;
+def FICOMP16m: FPI<0xDE, MRM3m, (outs), (ins i16mem:$src), "ficomp{s}\t$src">;
+
+def FBLDm : FPI<0xDF, MRM4m, (outs), (ins f32mem:$src), "fbld\t$src">;
+def FBSTPm : FPI<0xDF, MRM6m, (outs f32mem:$dst), (ins), "fbstp\t$dst">;
+
+// Floating point cmovs.
+class FpIf32CMov<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+ FpI_<outs, ins, fp, pattern>, Requires<[FPStackf32, HasCMov]>;
+class FpIf64CMov<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+ FpI_<outs, ins, fp, pattern>, Requires<[FPStackf64, HasCMov]>;
+
+multiclass FPCMov<PatLeaf cc> {
+ def _Fp32 : FpIf32CMov<(outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2),
+ CondMovFP,
+ [(set RFP32:$dst, (X86cmov RFP32:$src1, RFP32:$src2,
+ cc, EFLAGS))]>;
+ def _Fp64 : FpIf64CMov<(outs RFP64:$dst), (ins RFP64:$src1, RFP64:$src2),
+ CondMovFP,
+ [(set RFP64:$dst, (X86cmov RFP64:$src1, RFP64:$src2,
+ cc, EFLAGS))]>;
+ def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2),
+ CondMovFP,
+ [(set RFP80:$dst, (X86cmov RFP80:$src1, RFP80:$src2,
+ cc, EFLAGS))]>,
+ Requires<[HasCMov]>;
+}
+
+let Uses = [EFLAGS], isTwoAddress = 1 in {
+defm CMOVB : FPCMov<X86_COND_B>;
+defm CMOVBE : FPCMov<X86_COND_BE>;
+defm CMOVE : FPCMov<X86_COND_E>;
+defm CMOVP : FPCMov<X86_COND_P>;
+defm CMOVNB : FPCMov<X86_COND_AE>;
+defm CMOVNBE: FPCMov<X86_COND_A>;
+defm CMOVNE : FPCMov<X86_COND_NE>;
+defm CMOVNP : FPCMov<X86_COND_NP>;
+}
+
+let Predicates = [HasCMov] in {
+// These are not factored because there's no clean way to pass DA/DB.
+def CMOVB_F : FPI<0xC0, AddRegFrm, (outs RST:$op), (ins),
+ "fcmovb\t{$op, %st(0)|%ST(0), $op}">, DA;
+def CMOVBE_F : FPI<0xD0, AddRegFrm, (outs RST:$op), (ins),
+ "fcmovbe\t{$op, %st(0)|%ST(0), $op}">, DA;
+def CMOVE_F : FPI<0xC8, AddRegFrm, (outs RST:$op), (ins),
+ "fcmove\t{$op, %st(0)|%ST(0), $op}">, DA;
+def CMOVP_F : FPI<0xD8, AddRegFrm, (outs RST:$op), (ins),
+ "fcmovu\t {$op, %st(0)|%ST(0), $op}">, DA;
+def CMOVNB_F : FPI<0xC0, AddRegFrm, (outs RST:$op), (ins),
+ "fcmovnb\t{$op, %st(0)|%ST(0), $op}">, DB;
+def CMOVNBE_F: FPI<0xD0, AddRegFrm, (outs RST:$op), (ins),
+ "fcmovnbe\t{$op, %st(0)|%ST(0), $op}">, DB;
+def CMOVNE_F : FPI<0xC8, AddRegFrm, (outs RST:$op), (ins),
+ "fcmovne\t{$op, %st(0)|%ST(0), $op}">, DB;
+def CMOVNP_F : FPI<0xD8, AddRegFrm, (outs RST:$op), (ins),
+ "fcmovnu\t{$op, %st(0)|%ST(0), $op}">, DB;
+} // Predicates = [HasCMov]
+
+// Floating point loads & stores.
+let canFoldAsLoad = 1 in {
+def LD_Fp32m : FpIf32<(outs RFP32:$dst), (ins f32mem:$src), ZeroArgFP,
+ [(set RFP32:$dst, (loadf32 addr:$src))]>;
+let isReMaterializable = 1 in
+ def LD_Fp64m : FpIf64<(outs RFP64:$dst), (ins f64mem:$src), ZeroArgFP,
+ [(set RFP64:$dst, (loadf64 addr:$src))]>;
+def LD_Fp80m : FpI_<(outs RFP80:$dst), (ins f80mem:$src), ZeroArgFP,
+ [(set RFP80:$dst, (loadf80 addr:$src))]>;
+}
+def LD_Fp32m64 : FpIf64<(outs RFP64:$dst), (ins f32mem:$src), ZeroArgFP,
+ [(set RFP64:$dst, (f64 (extloadf32 addr:$src)))]>;
+def LD_Fp64m80 : FpI_<(outs RFP80:$dst), (ins f64mem:$src), ZeroArgFP,
+ [(set RFP80:$dst, (f80 (extloadf64 addr:$src)))]>;
+def LD_Fp32m80 : FpI_<(outs RFP80:$dst), (ins f32mem:$src), ZeroArgFP,
+ [(set RFP80:$dst, (f80 (extloadf32 addr:$src)))]>;
+def ILD_Fp16m32: FpIf32<(outs RFP32:$dst), (ins i16mem:$src), ZeroArgFP,
+ [(set RFP32:$dst, (X86fild addr:$src, i16))]>;
+def ILD_Fp32m32: FpIf32<(outs RFP32:$dst), (ins i32mem:$src), ZeroArgFP,
+ [(set RFP32:$dst, (X86fild addr:$src, i32))]>;
+def ILD_Fp64m32: FpIf32<(outs RFP32:$dst), (ins i64mem:$src), ZeroArgFP,
+ [(set RFP32:$dst, (X86fild addr:$src, i64))]>;
+def ILD_Fp16m64: FpIf64<(outs RFP64:$dst), (ins i16mem:$src), ZeroArgFP,
+ [(set RFP64:$dst, (X86fild addr:$src, i16))]>;
+def ILD_Fp32m64: FpIf64<(outs RFP64:$dst), (ins i32mem:$src), ZeroArgFP,
+ [(set RFP64:$dst, (X86fild addr:$src, i32))]>;
+def ILD_Fp64m64: FpIf64<(outs RFP64:$dst), (ins i64mem:$src), ZeroArgFP,
+ [(set RFP64:$dst, (X86fild addr:$src, i64))]>;
+def ILD_Fp16m80: FpI_<(outs RFP80:$dst), (ins i16mem:$src), ZeroArgFP,
+ [(set RFP80:$dst, (X86fild addr:$src, i16))]>;
+def ILD_Fp32m80: FpI_<(outs RFP80:$dst), (ins i32mem:$src), ZeroArgFP,
+ [(set RFP80:$dst, (X86fild addr:$src, i32))]>;
+def ILD_Fp64m80: FpI_<(outs RFP80:$dst), (ins i64mem:$src), ZeroArgFP,
+ [(set RFP80:$dst, (X86fild addr:$src, i64))]>;
+
+def ST_Fp32m : FpIf32<(outs), (ins f32mem:$op, RFP32:$src), OneArgFP,
+ [(store RFP32:$src, addr:$op)]>;
+def ST_Fp64m32 : FpIf64<(outs), (ins f32mem:$op, RFP64:$src), OneArgFP,
+ [(truncstoref32 RFP64:$src, addr:$op)]>;
+def ST_Fp64m : FpIf64<(outs), (ins f64mem:$op, RFP64:$src), OneArgFP,
+ [(store RFP64:$src, addr:$op)]>;
+def ST_Fp80m32 : FpI_<(outs), (ins f32mem:$op, RFP80:$src), OneArgFP,
+ [(truncstoref32 RFP80:$src, addr:$op)]>;
+def ST_Fp80m64 : FpI_<(outs), (ins f64mem:$op, RFP80:$src), OneArgFP,
+ [(truncstoref64 RFP80:$src, addr:$op)]>;
+// FST does not support 80-bit memory target; FSTP must be used.
+
+let mayStore = 1, neverHasSideEffects = 1 in {
+def ST_FpP32m : FpIf32<(outs), (ins f32mem:$op, RFP32:$src), OneArgFP, []>;
+def ST_FpP64m32 : FpIf64<(outs), (ins f32mem:$op, RFP64:$src), OneArgFP, []>;
+def ST_FpP64m : FpIf64<(outs), (ins f64mem:$op, RFP64:$src), OneArgFP, []>;
+def ST_FpP80m32 : FpI_<(outs), (ins f32mem:$op, RFP80:$src), OneArgFP, []>;
+def ST_FpP80m64 : FpI_<(outs), (ins f64mem:$op, RFP80:$src), OneArgFP, []>;
+}
+def ST_FpP80m : FpI_<(outs), (ins f80mem:$op, RFP80:$src), OneArgFP,
+ [(store RFP80:$src, addr:$op)]>;
+let mayStore = 1, neverHasSideEffects = 1 in {
+def IST_Fp16m32 : FpIf32<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP, []>;
+def IST_Fp32m32 : FpIf32<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP, []>;
+def IST_Fp64m32 : FpIf32<(outs), (ins i64mem:$op, RFP32:$src), OneArgFP, []>;
+def IST_Fp16m64 : FpIf64<(outs), (ins i16mem:$op, RFP64:$src), OneArgFP, []>;
+def IST_Fp32m64 : FpIf64<(outs), (ins i32mem:$op, RFP64:$src), OneArgFP, []>;
+def IST_Fp64m64 : FpIf64<(outs), (ins i64mem:$op, RFP64:$src), OneArgFP, []>;
+def IST_Fp16m80 : FpI_<(outs), (ins i16mem:$op, RFP80:$src), OneArgFP, []>;
+def IST_Fp32m80 : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP, []>;
+def IST_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP, []>;
+}
+
+let mayLoad = 1 in {
+def LD_F32m : FPI<0xD9, MRM0m, (outs), (ins f32mem:$src), "fld{s}\t$src">;
+def LD_F64m : FPI<0xDD, MRM0m, (outs), (ins f64mem:$src), "fld{l}\t$src">;
+def LD_F80m : FPI<0xDB, MRM5m, (outs), (ins f80mem:$src), "fld{t}\t$src">;
+def ILD_F16m : FPI<0xDF, MRM0m, (outs), (ins i16mem:$src), "fild{s}\t$src">;
+def ILD_F32m : FPI<0xDB, MRM0m, (outs), (ins i32mem:$src), "fild{l}\t$src">;
+def ILD_F64m : FPI<0xDF, MRM5m, (outs), (ins i64mem:$src), "fild{ll}\t$src">;
+}
+let mayStore = 1 in {
+def ST_F32m : FPI<0xD9, MRM2m, (outs), (ins f32mem:$dst), "fst{s}\t$dst">;
+def ST_F64m : FPI<0xDD, MRM2m, (outs), (ins f64mem:$dst), "fst{l}\t$dst">;
+def ST_FP32m : FPI<0xD9, MRM3m, (outs), (ins f32mem:$dst), "fstp{s}\t$dst">;
+def ST_FP64m : FPI<0xDD, MRM3m, (outs), (ins f64mem:$dst), "fstp{l}\t$dst">;
+def ST_FP80m : FPI<0xDB, MRM7m, (outs), (ins f80mem:$dst), "fstp{t}\t$dst">;
+def IST_F16m : FPI<0xDF, MRM2m, (outs), (ins i16mem:$dst), "fist{s}\t$dst">;
+def IST_F32m : FPI<0xDB, MRM2m, (outs), (ins i32mem:$dst), "fist{l}\t$dst">;
+def IST_FP16m : FPI<0xDF, MRM3m, (outs), (ins i16mem:$dst), "fistp{s}\t$dst">;
+def IST_FP32m : FPI<0xDB, MRM3m, (outs), (ins i32mem:$dst), "fistp{l}\t$dst">;
+def IST_FP64m : FPI<0xDF, MRM7m, (outs), (ins i64mem:$dst), "fistp{ll}\t$dst">;
+}
+
+// FISTTP requires SSE3 even though it's a FPStack op.
+def ISTT_Fp16m32 : FpI_<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP,
+ [(X86fp_to_i16mem RFP32:$src, addr:$op)]>,
+ Requires<[HasSSE3]>;
+def ISTT_Fp32m32 : FpI_<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP,
+ [(X86fp_to_i32mem RFP32:$src, addr:$op)]>,
+ Requires<[HasSSE3]>;
+def ISTT_Fp64m32 : FpI_<(outs), (ins i64mem:$op, RFP32:$src), OneArgFP,
+ [(X86fp_to_i64mem RFP32:$src, addr:$op)]>,
+ Requires<[HasSSE3]>;
+def ISTT_Fp16m64 : FpI_<(outs), (ins i16mem:$op, RFP64:$src), OneArgFP,
+ [(X86fp_to_i16mem RFP64:$src, addr:$op)]>,
+ Requires<[HasSSE3]>;
+def ISTT_Fp32m64 : FpI_<(outs), (ins i32mem:$op, RFP64:$src), OneArgFP,
+ [(X86fp_to_i32mem RFP64:$src, addr:$op)]>,
+ Requires<[HasSSE3]>;
+def ISTT_Fp64m64 : FpI_<(outs), (ins i64mem:$op, RFP64:$src), OneArgFP,
+ [(X86fp_to_i64mem RFP64:$src, addr:$op)]>,
+ Requires<[HasSSE3]>;
+def ISTT_Fp16m80 : FpI_<(outs), (ins i16mem:$op, RFP80:$src), OneArgFP,
+ [(X86fp_to_i16mem RFP80:$src, addr:$op)]>,
+ Requires<[HasSSE3]>;
+def ISTT_Fp32m80 : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP,
+ [(X86fp_to_i32mem RFP80:$src, addr:$op)]>,
+ Requires<[HasSSE3]>;
+def ISTT_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP,
+ [(X86fp_to_i64mem RFP80:$src, addr:$op)]>,
+ Requires<[HasSSE3]>;
+
+let mayStore = 1 in {
+def ISTT_FP16m : FPI<0xDF, MRM1m, (outs), (ins i16mem:$dst), "fisttp{s}\t$dst">;
+def ISTT_FP32m : FPI<0xDB, MRM1m, (outs), (ins i32mem:$dst), "fisttp{l}\t$dst">;
+def ISTT_FP64m : FPI<0xDD, MRM1m, (outs), (ins i64mem:$dst),
+ "fisttp{ll}\t$dst">;
+}
+
+// FP Stack manipulation instructions.
+def LD_Frr : FPI<0xC0, AddRegFrm, (outs), (ins RST:$op), "fld\t$op">, D9;
+def ST_Frr : FPI<0xD0, AddRegFrm, (outs), (ins RST:$op), "fst\t$op">, DD;
+def ST_FPrr : FPI<0xD8, AddRegFrm, (outs), (ins RST:$op), "fstp\t$op">, DD;
+def XCH_F : FPI<0xC8, AddRegFrm, (outs), (ins RST:$op), "fxch\t$op">, D9;
+
+// Floating point constant loads.
+let isReMaterializable = 1 in {
+def LD_Fp032 : FpIf32<(outs RFP32:$dst), (ins), ZeroArgFP,
+ [(set RFP32:$dst, fpimm0)]>;
+def LD_Fp132 : FpIf32<(outs RFP32:$dst), (ins), ZeroArgFP,
+ [(set RFP32:$dst, fpimm1)]>;
+def LD_Fp064 : FpIf64<(outs RFP64:$dst), (ins), ZeroArgFP,
+ [(set RFP64:$dst, fpimm0)]>;
+def LD_Fp164 : FpIf64<(outs RFP64:$dst), (ins), ZeroArgFP,
+ [(set RFP64:$dst, fpimm1)]>;
+def LD_Fp080 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP,
+ [(set RFP80:$dst, fpimm0)]>;
+def LD_Fp180 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP,
+ [(set RFP80:$dst, fpimm1)]>;
+}
+
+def LD_F0 : FPI<0xEE, RawFrm, (outs), (ins), "fldz">, D9;
+def LD_F1 : FPI<0xE8, RawFrm, (outs), (ins), "fld1">, D9;
+
+
+// Floating point compares.
+let Defs = [EFLAGS] in {
+def UCOM_Fpr32 : FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
+ []>; // FPSW = cmp ST(0) with ST(i)
+def UCOM_Fpr64 : FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
+ []>; // FPSW = cmp ST(0) with ST(i)
+def UCOM_Fpr80 : FpI_ <(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
+ []>; // FPSW = cmp ST(0) with ST(i)
+
+// CC = ST(0) cmp ST(i)
+def UCOM_FpIr32: FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
+ [(set EFLAGS, (X86cmp RFP32:$lhs, RFP32:$rhs))]>;
+def UCOM_FpIr64: FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
+ [(set EFLAGS, (X86cmp RFP64:$lhs, RFP64:$rhs))]>;
+def UCOM_FpIr80: FpI_<(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
+ [(set EFLAGS, (X86cmp RFP80:$lhs, RFP80:$rhs))]>;
+}
+
+let Defs = [EFLAGS], Uses = [ST0] in {
+def UCOM_Fr : FPI<0xE0, AddRegFrm, // FPSW = cmp ST(0) with ST(i)
+ (outs), (ins RST:$reg),
+ "fucom\t$reg">, DD;
+def UCOM_FPr : FPI<0xE8, AddRegFrm, // FPSW = cmp ST(0) with ST(i), pop
+ (outs), (ins RST:$reg),
+ "fucomp\t$reg">, DD;
+def UCOM_FPPr : FPI<0xE9, RawFrm, // cmp ST(0) with ST(1), pop, pop
+ (outs), (ins),
+ "fucompp">, DA;
+
+def UCOM_FIr : FPI<0xE8, AddRegFrm, // CC = cmp ST(0) with ST(i)
+ (outs), (ins RST:$reg),
+ "fucomi\t{$reg, %st(0)|%ST(0), $reg}">, DB;
+def UCOM_FIPr : FPI<0xE8, AddRegFrm, // CC = cmp ST(0) with ST(i), pop
+ (outs), (ins RST:$reg),
+ "fucomip\t{$reg, %st(0)|%ST(0), $reg}">, DF;
+}
+
+def COM_FIr : FPI<0xF0, AddRegFrm, (outs), (ins RST:$reg),
+ "fcomi\t{$reg, %st(0)|%ST(0), $reg}">, DB;
+def COM_FIPr : FPI<0xF0, AddRegFrm, (outs), (ins RST:$reg),
+ "fcomip\t{$reg, %st(0)|%ST(0), $reg}">, DF;
+
+// Floating point flag ops.
+let Defs = [AX] in
+def FNSTSW8r : I<0xE0, RawFrm, // AX = fp flags
+ (outs), (ins), "fnstsw %ax", []>, DF;
+
+def FNSTCW16m : I<0xD9, MRM7m, // [mem16] = X87 control word
+ (outs), (ins i16mem:$dst), "fnstcw\t$dst",
+ [(X86fp_cwd_get16 addr:$dst)]>;
+
+let mayLoad = 1 in
+def FLDCW16m : I<0xD9, MRM5m, // X87 control word = [mem16]
+ (outs), (ins i16mem:$dst), "fldcw\t$dst", []>;
+
+// Register free
+
+def FFREE : FPI<0xC0, AddRegFrm, (outs), (ins RST:$reg),
+ "ffree\t$reg">, DD;
+
+// Clear exceptions
+
+def FNCLEX : I<0xE2, RawFrm, (outs), (ins), "fnclex", []>, DB;
+
+// Operandless floating-point instructions for the disassembler
+
+def FNOP : I<0xD0, RawFrm, (outs), (ins), "fnop", []>, D9;
+def FXAM : I<0xE5, RawFrm, (outs), (ins), "fxam", []>, D9;
+def FLDL2T : I<0xE9, RawFrm, (outs), (ins), "fldl2t", []>, D9;
+def FLDL2E : I<0xEA, RawFrm, (outs), (ins), "fldl2e", []>, D9;
+def FLDPI : I<0xEB, RawFrm, (outs), (ins), "fldpi", []>, D9;
+def FLDLG2 : I<0xEC, RawFrm, (outs), (ins), "fldlg2", []>, D9;
+def FLDLN2 : I<0xED, RawFrm, (outs), (ins), "fldln2", []>, D9;
+def F2XM1 : I<0xF0, RawFrm, (outs), (ins), "f2xm1", []>, D9;
+def FYL2X : I<0xF1, RawFrm, (outs), (ins), "fyl2x", []>, D9;
+def FPTAN : I<0xF2, RawFrm, (outs), (ins), "fptan", []>, D9;
+def FPATAN : I<0xF3, RawFrm, (outs), (ins), "fpatan", []>, D9;
+def FXTRACT : I<0xF4, RawFrm, (outs), (ins), "fxtract", []>, D9;
+def FPREM1 : I<0xF5, RawFrm, (outs), (ins), "fprem1", []>, D9;
+def FDECSTP : I<0xF6, RawFrm, (outs), (ins), "fdecstp", []>, D9;
+def FINCSTP : I<0xF7, RawFrm, (outs), (ins), "fincstp", []>, D9;
+def FPREM : I<0xF8, RawFrm, (outs), (ins), "fprem", []>, D9;
+def FYL2XP1 : I<0xF9, RawFrm, (outs), (ins), "fyl2xp1", []>, D9;
+def FSINCOS : I<0xFB, RawFrm, (outs), (ins), "fsincos", []>, D9;
+def FRNDINT : I<0xFC, RawFrm, (outs), (ins), "frndint", []>, D9;
+def FSCALE : I<0xFD, RawFrm, (outs), (ins), "fscale", []>, D9;
+def FCOMPP : I<0xD9, RawFrm, (outs), (ins), "fcompp", []>, DE;
+
+def FXSAVE : I<0xAE, MRM0m, (outs opaque512mem:$dst), (ins),
+ "fxsave\t$dst", []>, TB;
+def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src),
+ "fxrstor\t$src", []>, TB;
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// Required for RET of f32 / f64 / f80 values.
+def : Pat<(X86fld addr:$src, f32), (LD_Fp32m addr:$src)>;
+def : Pat<(X86fld addr:$src, f64), (LD_Fp64m addr:$src)>;
+def : Pat<(X86fld addr:$src, f80), (LD_Fp80m addr:$src)>;
+
+// Required for CALLs which return f32 / f64 / f80 values.
+def : Pat<(X86fst RFP32:$src, addr:$op, f32), (ST_Fp32m addr:$op, RFP32:$src)>;
+def : Pat<(X86fst RFP64:$src, addr:$op, f32), (ST_Fp64m32 addr:$op,
+ RFP64:$src)>;
+def : Pat<(X86fst RFP64:$src, addr:$op, f64), (ST_Fp64m addr:$op, RFP64:$src)>;
+def : Pat<(X86fst RFP80:$src, addr:$op, f32), (ST_Fp80m32 addr:$op,
+ RFP80:$src)>;
+def : Pat<(X86fst RFP80:$src, addr:$op, f64), (ST_Fp80m64 addr:$op,
+ RFP80:$src)>;
+def : Pat<(X86fst RFP80:$src, addr:$op, f80), (ST_FpP80m addr:$op,
+ RFP80:$src)>;
+
+// Floating point constant -0.0 and -1.0
+def : Pat<(f32 fpimmneg0), (CHS_Fp32 (LD_Fp032))>, Requires<[FPStackf32]>;
+def : Pat<(f32 fpimmneg1), (CHS_Fp32 (LD_Fp132))>, Requires<[FPStackf32]>;
+def : Pat<(f64 fpimmneg0), (CHS_Fp64 (LD_Fp064))>, Requires<[FPStackf64]>;
+def : Pat<(f64 fpimmneg1), (CHS_Fp64 (LD_Fp164))>, Requires<[FPStackf64]>;
+def : Pat<(f80 fpimmneg0), (CHS_Fp80 (LD_Fp080))>;
+def : Pat<(f80 fpimmneg1), (CHS_Fp80 (LD_Fp180))>;
+
+// Used to convert i64 to f64 since there isn't an SSE version.
+def : Pat<(X86fildflag addr:$src, i64), (ILD_Fp64m64 addr:$src)>;
+
+// FP extensions map onto simple pseudo-value conversions if they are to/from
+// the FP stack.
+def : Pat<(f64 (fextend RFP32:$src)), (MOV_Fp3264 RFP32:$src)>,
+ Requires<[FPStackf32]>;
+def : Pat<(f80 (fextend RFP32:$src)), (MOV_Fp3280 RFP32:$src)>,
+ Requires<[FPStackf32]>;
+def : Pat<(f80 (fextend RFP64:$src)), (MOV_Fp6480 RFP64:$src)>,
+ Requires<[FPStackf64]>;
+
+// FP truncations map onto simple pseudo-value conversions if they are to/from
+// the FP stack. We have validated that only value-preserving truncations make
+// it through isel.
+def : Pat<(f32 (fround RFP64:$src)), (MOV_Fp6432 RFP64:$src)>,
+ Requires<[FPStackf32]>;
+def : Pat<(f32 (fround RFP80:$src)), (MOV_Fp8032 RFP80:$src)>,
+ Requires<[FPStackf32]>;
+def : Pat<(f64 (fround RFP80:$src)), (MOV_Fp8064 RFP80:$src)>,
+ Requires<[FPStackf64]>;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFormats.td b/contrib/llvm/lib/Target/X86/X86InstrFormats.td
new file mode 100644
index 0000000..c4522f3
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrFormats.td
@@ -0,0 +1,400 @@
+//===- X86InstrFormats.td - X86 Instruction Formats --------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// X86 Instruction Format Definitions.
+//
+
+// Format specifies the encoding used by the instruction. This is part of the
+// ad-hoc solution used to emit machine instruction encodings by our machine
+// code emitter.
+class Format<bits<6> val> {
+ bits<6> Value = val;
+}
+
+def Pseudo : Format<0>; def RawFrm : Format<1>;
+def AddRegFrm : Format<2>; def MRMDestReg : Format<3>;
+def MRMDestMem : Format<4>; def MRMSrcReg : Format<5>;
+def MRMSrcMem : Format<6>;
+def MRM0r : Format<16>; def MRM1r : Format<17>; def MRM2r : Format<18>;
+def MRM3r : Format<19>; def MRM4r : Format<20>; def MRM5r : Format<21>;
+def MRM6r : Format<22>; def MRM7r : Format<23>;
+def MRM0m : Format<24>; def MRM1m : Format<25>; def MRM2m : Format<26>;
+def MRM3m : Format<27>; def MRM4m : Format<28>; def MRM5m : Format<29>;
+def MRM6m : Format<30>; def MRM7m : Format<31>;
+def MRMInitReg : Format<32>;
+def MRM_C1 : Format<33>;
+def MRM_C2 : Format<34>;
+def MRM_C3 : Format<35>;
+def MRM_C4 : Format<36>;
+def MRM_C8 : Format<37>;
+def MRM_C9 : Format<38>;
+def MRM_E8 : Format<39>;
+def MRM_F0 : Format<40>;
+def MRM_F8 : Format<41>;
+def MRM_F9 : Format<42>;
+
+// ImmType - This specifies the immediate type used by an instruction. This is
+// part of the ad-hoc solution used to emit machine instruction encodings by our
+// machine code emitter.
+class ImmType<bits<3> val> {
+ bits<3> Value = val;
+}
+def NoImm : ImmType<0>;
+def Imm8 : ImmType<1>;
+def Imm8PCRel : ImmType<2>;
+def Imm16 : ImmType<3>;
+def Imm32 : ImmType<4>;
+def Imm32PCRel : ImmType<5>;
+def Imm64 : ImmType<6>;
+
+// FPFormat - This specifies what form this FP instruction has. This is used by
+// the Floating-Point stackifier pass.
+class FPFormat<bits<3> val> {
+ bits<3> Value = val;
+}
+def NotFP : FPFormat<0>;
+def ZeroArgFP : FPFormat<1>;
+def OneArgFP : FPFormat<2>;
+def OneArgFPRW : FPFormat<3>;
+def TwoArgFP : FPFormat<4>;
+def CompareFP : FPFormat<5>;
+def CondMovFP : FPFormat<6>;
+def SpecialFP : FPFormat<7>;
+
+// Class specifying the SSE execution domain, used by the SSEDomainFix pass.
+// Keep in sync with tables in X86InstrInfo.cpp.
+class Domain<bits<2> val> {
+ bits<2> Value = val;
+}
+def GenericDomain : Domain<0>;
+def SSEPackedSingle : Domain<1>;
+def SSEPackedDouble : Domain<2>;
+def SSEPackedInt : Domain<3>;
+
+// Prefix byte classes which are used to indicate to the ad-hoc machine code
+// emitter that various prefix bytes are required.
+class OpSize { bit hasOpSizePrefix = 1; }
+class AdSize { bit hasAdSizePrefix = 1; }
+class REX_W { bit hasREX_WPrefix = 1; }
+class LOCK { bit hasLockPrefix = 1; }
+class SegFS { bits<2> SegOvrBits = 1; }
+class SegGS { bits<2> SegOvrBits = 2; }
+class TB { bits<4> Prefix = 1; }
+class REP { bits<4> Prefix = 2; }
+class D8 { bits<4> Prefix = 3; }
+class D9 { bits<4> Prefix = 4; }
+class DA { bits<4> Prefix = 5; }
+class DB { bits<4> Prefix = 6; }
+class DC { bits<4> Prefix = 7; }
+class DD { bits<4> Prefix = 8; }
+class DE { bits<4> Prefix = 9; }
+class DF { bits<4> Prefix = 10; }
+class XD { bits<4> Prefix = 11; }
+class XS { bits<4> Prefix = 12; }
+class T8 { bits<4> Prefix = 13; }
+class TA { bits<4> Prefix = 14; }
+class TF { bits<4> Prefix = 15; }
+
+class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
+ string AsmStr, Domain d = GenericDomain>
+ : Instruction {
+ let Namespace = "X86";
+
+ bits<8> Opcode = opcod;
+ Format Form = f;
+ bits<6> FormBits = Form.Value;
+ ImmType ImmT = i;
+
+ dag OutOperandList = outs;
+ dag InOperandList = ins;
+ string AsmString = AsmStr;
+
+ //
+ // Attributes specific to X86 instructions...
+ //
+ bit hasOpSizePrefix = 0; // Does this inst have a 0x66 prefix?
+ bit hasAdSizePrefix = 0; // Does this inst have a 0x67 prefix?
+
+ bits<4> Prefix = 0; // Which prefix byte does this inst have?
+ bit hasREX_WPrefix = 0; // Does this inst require the REX.W prefix?
+ FPFormat FPForm = NotFP; // What flavor of FP instruction is this?
+ bit hasLockPrefix = 0; // Does this inst have a 0xF0 prefix?
+ bits<2> SegOvrBits = 0; // Segment override prefix.
+ Domain ExeDomain = d;
+
+ // TSFlags layout should be kept in sync with X86InstrInfo.h.
+ let TSFlags{5-0} = FormBits;
+ let TSFlags{6} = hasOpSizePrefix;
+ let TSFlags{7} = hasAdSizePrefix;
+ let TSFlags{11-8} = Prefix;
+ let TSFlags{12} = hasREX_WPrefix;
+ let TSFlags{15-13} = ImmT.Value;
+ let TSFlags{18-16} = FPForm.Value;
+ let TSFlags{19} = hasLockPrefix;
+ let TSFlags{21-20} = SegOvrBits;
+ let TSFlags{23-22} = ExeDomain.Value;
+ let TSFlags{31-24} = Opcode;
+}
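+
+// Illustration: a class such as
+//   class PDI<...> : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, TB, OpSize
+// sets Prefix = 1 (TB) and hasOpSizePrefix = 1, so its instructions end up with
+// TSFlags{11-8} = 1, TSFlags{6} = 1, and TSFlags{23-22} = 2 (SSEPackedDouble).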
+
+class I<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern, Domain d = GenericDomain>
+ : X86Inst<o, f, NoImm, outs, ins, asm, d> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+class Ii8 <bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern, Domain d = GenericDomain>
+ : X86Inst<o, f, Imm8, outs, ins, asm, d> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+class Ii8PCRel<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : X86Inst<o, f, Imm8PCRel, outs, ins, asm> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+class Ii16<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : X86Inst<o, f, Imm16, outs, ins, asm> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+class Ii32<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : X86Inst<o, f, Imm32, outs, ins, asm> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+
+class Ii32PCRel<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : X86Inst<o, f, Imm32PCRel, outs, ins, asm> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+
+// FPStack Instruction Templates:
+// FPI - Floating Point Instruction template.
+class FPI<bits<8> o, Format F, dag outs, dag ins, string asm>
+ : I<o, F, outs, ins, asm, []> {}
+
+// FpI_ - Floating Point Pseudo Instruction template. Not Predicated.
+class FpI_<dag outs, dag ins, FPFormat fp, list<dag> pattern>
+ : X86Inst<0, Pseudo, NoImm, outs, ins, ""> {
+ let FPForm = fp;
+ let Pattern = pattern;
+}
+
+// Templates for instructions that use a 16- or 32-bit segmented address as
+// their only operand: lcall (FAR CALL) and ljmp (FAR JMP)
+//
+// Iseg16 - 16-bit segment selector, 16-bit offset
+// Iseg32 - 16-bit segment selector, 32-bit offset
+
+class Iseg16 <bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern> : X86Inst<o, f, NoImm, outs, ins, asm> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+
+class Iseg32 <bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern> : X86Inst<o, f, NoImm, outs, ins, asm> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+
+// SSE1 Instruction Templates:
+//
+// SSI - SSE1 instructions with XS prefix.
+// PSI - SSE1 instructions with TB prefix.
+// PSIi8 - SSE1 instructions with ImmT == Imm8 and TB prefix.
+
+class SSI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE1]>;
+class SSIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE1]>;
+class PSI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedSingle>, TB,
+ Requires<[HasSSE1]>;
+class PSIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedSingle>, TB,
+ Requires<[HasSSE1]>;
+
+// SSE2 Instruction Templates:
+//
+// SDI - SSE2 instructions with XD prefix.
+// SDIi8 - SSE2 instructions with ImmT == Imm8 and XD prefix.
+// SSDIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix.
+// PDI - SSE2 instructions with TB and OpSize prefixes.
+// PDIi8 - SSE2 instructions with ImmT == Imm8 and TB and OpSize prefixes.
+
+class SDI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, XD, Requires<[HasSSE2]>;
+class SDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, XD, Requires<[HasSSE2]>;
+class SSDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE2]>;
+class PDI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, TB, OpSize,
+ Requires<[HasSSE2]>;
+class PDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedDouble>, TB, OpSize,
+ Requires<[HasSSE2]>;
+
+// SSE3 Instruction Templates:
+//
+// S3I - SSE3 instructions with TB and OpSize prefixes.
+// S3SI - SSE3 instructions with XS prefix.
+// S3DI - SSE3 instructions with XD prefix.
+
+class S3SI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedSingle>, XS,
+ Requires<[HasSSE3]>;
+class S3DI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, XD,
+ Requires<[HasSSE3]>;
+class S3I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, TB, OpSize,
+ Requires<[HasSSE3]>;
+
+
+// SSSE3 Instruction Templates:
+//
+// SS38I - SSSE3 instructions with T8 prefix.
+// SS3AI - SSSE3 instructions with TA prefix.
+//
+// Note: SSSE3 instructions have 64-bit and 128-bit versions. The 64-bit version
+// uses the MMX registers. We put those instructions here because they fit
+// better into the SSSE3 instruction category than into the MMX category.
+
+class SS38I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8,
+ Requires<[HasSSSE3]>;
+class SS3AI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA,
+ Requires<[HasSSSE3]>;
+
+// SSE4.1 Instruction Templates:
+//
+// SS48I - SSE 4.1 instructions with T8 prefix.
+// SS4AIi8 - SSE 4.1 instructions with TA prefix and ImmT == Imm8.
+//
+class SS48I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8,
+ Requires<[HasSSE41]>;
+class SS4AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA,
+ Requires<[HasSSE41]>;
+
+// SSE4.2 Instruction Templates:
+//
+// SS428I - SSE 4.2 instructions with T8 prefix.
+class SS428I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8,
+ Requires<[HasSSE42]>;
+
+// SS42FI - SSE 4.2 instructions with TF prefix.
+class SS42FI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, TF, Requires<[HasSSE42]>;
+
+// SS42AI - SSE 4.2 instructions with TA prefix.
+class SS42AI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA,
+ Requires<[HasSSE42]>;
+
+// AES Instruction Templates:
+//
+// AES8I - AES instructions with T8 prefix.
+// AESAI - AES instructions with TA prefix and ImmT == Imm8.
+// These use the same encodings as the SSE4.2 T8 and TA instructions.
+class AES8I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag>pattern>
+ : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8,
+ Requires<[HasAES]>;
+
+class AESAI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA,
+ Requires<[HasAES]>;
+
+// X86-64 Instruction templates...
+//
+
+class RI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, REX_W;
+class RIi8 <bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, REX_W;
+class RIi32 <bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii32<o, F, outs, ins, asm, pattern>, REX_W;
+
+class RIi64<bits<8> o, Format f, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : X86Inst<o, f, Imm64, outs, ins, asm>, REX_W {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+
+class RSSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : SSI<o, F, outs, ins, asm, pattern>, REX_W;
+class RSDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : SDI<o, F, outs, ins, asm, pattern>, REX_W;
+class RPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : PDI<o, F, outs, ins, asm, pattern>, REX_W;
+
+// MMX Instruction templates
+//
+
+// MMXI - MMX instructions with TB prefix.
+// MMXI64 - MMX instructions with TB prefix valid only in 64 bit mode.
+// MMX2I - MMX / SSE2 instructions with TB and OpSize prefixes.
+// MMXIi8 - MMX instructions with ImmT == Imm8 and TB prefix.
+// MMXID - MMX instructions with XD prefix.
+// MMXIS - MMX instructions with XS prefix.
+class MMXI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, TB, Requires<[HasMMX]>;
+class MMXI64<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, TB, Requires<[HasMMX,In64BitMode]>;
+class MMXRI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, TB, REX_W, Requires<[HasMMX]>;
+class MMX2I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : I<o, F, outs, ins, asm, pattern>, TB, OpSize, Requires<[HasMMX]>;
+class MMXIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, TB, Requires<[HasMMX]>;
+class MMXID<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, XD, Requires<[HasMMX]>;
+class MMXIS<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasMMX]>;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
new file mode 100644
index 0000000..6b9478d
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -0,0 +1,62 @@
+//======- X86InstrFragmentsSIMD.td - x86 ISA -------------*- tablegen -*-=====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides pattern fragments useful for SIMD instructions.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MMX Pattern Fragments
+//===----------------------------------------------------------------------===//
+
+def load_mmx : PatFrag<(ops node:$ptr), (v1i64 (load node:$ptr))>;
+
+def bc_v8i8 : PatFrag<(ops node:$in), (v8i8 (bitconvert node:$in))>;
+def bc_v4i16 : PatFrag<(ops node:$in), (v4i16 (bitconvert node:$in))>;
+def bc_v2i32 : PatFrag<(ops node:$in), (v2i32 (bitconvert node:$in))>;
+def bc_v1i64 : PatFrag<(ops node:$in), (v1i64 (bitconvert node:$in))>;
+
+//===----------------------------------------------------------------------===//
+// MMX Masks
+//===----------------------------------------------------------------------===//
+
+// MMX_SHUFFLE_get_shuf_imm xform function: convert vector_shuffle mask to
+// PSHUFW imm.
+def MMX_SHUFFLE_get_shuf_imm : SDNodeXForm<vector_shuffle, [{
+ return getI8Imm(X86::getShuffleSHUFImmediate(N));
+}]>;
+
+// Patterns for: vector_shuffle v1, v2, <2, 6, 3, 7, ...>
+def mmx_unpckh : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isUNPCKHMask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+// Patterns for: vector_shuffle v1, v2, <0, 4, 2, 5, ...>
+def mmx_unpckl : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isUNPCKLMask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+// Patterns for: vector_shuffle v1, <undef>, <0, 0, 1, 1, ...>
+def mmx_unpckh_undef : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isUNPCKH_v_undef_Mask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+// Patterns for: vector_shuffle v1, <undef>, <2, 2, 3, 3, ...>
+def mmx_unpckl_undef : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isUNPCKL_v_undef_Mask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def mmx_pshufw : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isPSHUFDMask(cast<ShuffleVectorSDNode>(N));
+}], MMX_SHUFFLE_get_shuf_imm>;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp
new file mode 100644
index 0000000..34e12ca
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -0,0 +1,3786 @@
+//===- X86InstrInfo.cpp - X86 Instruction Information -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86InstrInfo.h"
+#include "X86.h"
+#include "X86GenInstrInfo.inc"
+#include "X86InstrBuilder.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/MC/MCAsmInfo.h"
+
+#include <limits>
+
+using namespace llvm;
+
+static cl::opt<bool>
+NoFusing("disable-spill-fusing",
+ cl::desc("Disable fusing of spill code into instructions"));
+static cl::opt<bool>
+PrintFailedFusing("print-failed-fuse-candidates",
+ cl::desc("Print instructions that the allocator wants to"
+ " fuse, but the X86 backend currently can't"),
+ cl::Hidden);
+static cl::opt<bool>
+ReMatPICStubLoad("remat-pic-stub-load",
+ cl::desc("Re-materialize load from stub in PIC mode"),
+ cl::init(false), cl::Hidden);
+
+X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
+ : TargetInstrInfoImpl(X86Insts, array_lengthof(X86Insts)),
+ TM(tm), RI(tm, *this) {
+ SmallVector<unsigned,16> AmbEntries;
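+ // The tables below map register-form opcodes to their memory-form
+ // equivalents (plus the required memory-operand alignment) so spill code can
+ // be fused into instructions; MemOp2RegOpTable holds the reverse mapping
+ // used for unfolding.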
+ static const unsigned OpTbl2Addr[][2] = {
+ { X86::ADC32ri, X86::ADC32mi },
+ { X86::ADC32ri8, X86::ADC32mi8 },
+ { X86::ADC32rr, X86::ADC32mr },
+ { X86::ADC64ri32, X86::ADC64mi32 },
+ { X86::ADC64ri8, X86::ADC64mi8 },
+ { X86::ADC64rr, X86::ADC64mr },
+ { X86::ADD16ri, X86::ADD16mi },
+ { X86::ADD16ri8, X86::ADD16mi8 },
+ { X86::ADD16rr, X86::ADD16mr },
+ { X86::ADD32ri, X86::ADD32mi },
+ { X86::ADD32ri8, X86::ADD32mi8 },
+ { X86::ADD32rr, X86::ADD32mr },
+ { X86::ADD64ri32, X86::ADD64mi32 },
+ { X86::ADD64ri8, X86::ADD64mi8 },
+ { X86::ADD64rr, X86::ADD64mr },
+ { X86::ADD8ri, X86::ADD8mi },
+ { X86::ADD8rr, X86::ADD8mr },
+ { X86::AND16ri, X86::AND16mi },
+ { X86::AND16ri8, X86::AND16mi8 },
+ { X86::AND16rr, X86::AND16mr },
+ { X86::AND32ri, X86::AND32mi },
+ { X86::AND32ri8, X86::AND32mi8 },
+ { X86::AND32rr, X86::AND32mr },
+ { X86::AND64ri32, X86::AND64mi32 },
+ { X86::AND64ri8, X86::AND64mi8 },
+ { X86::AND64rr, X86::AND64mr },
+ { X86::AND8ri, X86::AND8mi },
+ { X86::AND8rr, X86::AND8mr },
+ { X86::DEC16r, X86::DEC16m },
+ { X86::DEC32r, X86::DEC32m },
+ { X86::DEC64_16r, X86::DEC64_16m },
+ { X86::DEC64_32r, X86::DEC64_32m },
+ { X86::DEC64r, X86::DEC64m },
+ { X86::DEC8r, X86::DEC8m },
+ { X86::INC16r, X86::INC16m },
+ { X86::INC32r, X86::INC32m },
+ { X86::INC64_16r, X86::INC64_16m },
+ { X86::INC64_32r, X86::INC64_32m },
+ { X86::INC64r, X86::INC64m },
+ { X86::INC8r, X86::INC8m },
+ { X86::NEG16r, X86::NEG16m },
+ { X86::NEG32r, X86::NEG32m },
+ { X86::NEG64r, X86::NEG64m },
+ { X86::NEG8r, X86::NEG8m },
+ { X86::NOT16r, X86::NOT16m },
+ { X86::NOT32r, X86::NOT32m },
+ { X86::NOT64r, X86::NOT64m },
+ { X86::NOT8r, X86::NOT8m },
+ { X86::OR16ri, X86::OR16mi },
+ { X86::OR16ri8, X86::OR16mi8 },
+ { X86::OR16rr, X86::OR16mr },
+ { X86::OR32ri, X86::OR32mi },
+ { X86::OR32ri8, X86::OR32mi8 },
+ { X86::OR32rr, X86::OR32mr },
+ { X86::OR64ri32, X86::OR64mi32 },
+ { X86::OR64ri8, X86::OR64mi8 },
+ { X86::OR64rr, X86::OR64mr },
+ { X86::OR8ri, X86::OR8mi },
+ { X86::OR8rr, X86::OR8mr },
+ { X86::ROL16r1, X86::ROL16m1 },
+ { X86::ROL16rCL, X86::ROL16mCL },
+ { X86::ROL16ri, X86::ROL16mi },
+ { X86::ROL32r1, X86::ROL32m1 },
+ { X86::ROL32rCL, X86::ROL32mCL },
+ { X86::ROL32ri, X86::ROL32mi },
+ { X86::ROL64r1, X86::ROL64m1 },
+ { X86::ROL64rCL, X86::ROL64mCL },
+ { X86::ROL64ri, X86::ROL64mi },
+ { X86::ROL8r1, X86::ROL8m1 },
+ { X86::ROL8rCL, X86::ROL8mCL },
+ { X86::ROL8ri, X86::ROL8mi },
+ { X86::ROR16r1, X86::ROR16m1 },
+ { X86::ROR16rCL, X86::ROR16mCL },
+ { X86::ROR16ri, X86::ROR16mi },
+ { X86::ROR32r1, X86::ROR32m1 },
+ { X86::ROR32rCL, X86::ROR32mCL },
+ { X86::ROR32ri, X86::ROR32mi },
+ { X86::ROR64r1, X86::ROR64m1 },
+ { X86::ROR64rCL, X86::ROR64mCL },
+ { X86::ROR64ri, X86::ROR64mi },
+ { X86::ROR8r1, X86::ROR8m1 },
+ { X86::ROR8rCL, X86::ROR8mCL },
+ { X86::ROR8ri, X86::ROR8mi },
+ { X86::SAR16r1, X86::SAR16m1 },
+ { X86::SAR16rCL, X86::SAR16mCL },
+ { X86::SAR16ri, X86::SAR16mi },
+ { X86::SAR32r1, X86::SAR32m1 },
+ { X86::SAR32rCL, X86::SAR32mCL },
+ { X86::SAR32ri, X86::SAR32mi },
+ { X86::SAR64r1, X86::SAR64m1 },
+ { X86::SAR64rCL, X86::SAR64mCL },
+ { X86::SAR64ri, X86::SAR64mi },
+ { X86::SAR8r1, X86::SAR8m1 },
+ { X86::SAR8rCL, X86::SAR8mCL },
+ { X86::SAR8ri, X86::SAR8mi },
+ { X86::SBB32ri, X86::SBB32mi },
+ { X86::SBB32ri8, X86::SBB32mi8 },
+ { X86::SBB32rr, X86::SBB32mr },
+ { X86::SBB64ri32, X86::SBB64mi32 },
+ { X86::SBB64ri8, X86::SBB64mi8 },
+ { X86::SBB64rr, X86::SBB64mr },
+ { X86::SHL16rCL, X86::SHL16mCL },
+ { X86::SHL16ri, X86::SHL16mi },
+ { X86::SHL32rCL, X86::SHL32mCL },
+ { X86::SHL32ri, X86::SHL32mi },
+ { X86::SHL64rCL, X86::SHL64mCL },
+ { X86::SHL64ri, X86::SHL64mi },
+ { X86::SHL8rCL, X86::SHL8mCL },
+ { X86::SHL8ri, X86::SHL8mi },
+ { X86::SHLD16rrCL, X86::SHLD16mrCL },
+ { X86::SHLD16rri8, X86::SHLD16mri8 },
+ { X86::SHLD32rrCL, X86::SHLD32mrCL },
+ { X86::SHLD32rri8, X86::SHLD32mri8 },
+ { X86::SHLD64rrCL, X86::SHLD64mrCL },
+ { X86::SHLD64rri8, X86::SHLD64mri8 },
+ { X86::SHR16r1, X86::SHR16m1 },
+ { X86::SHR16rCL, X86::SHR16mCL },
+ { X86::SHR16ri, X86::SHR16mi },
+ { X86::SHR32r1, X86::SHR32m1 },
+ { X86::SHR32rCL, X86::SHR32mCL },
+ { X86::SHR32ri, X86::SHR32mi },
+ { X86::SHR64r1, X86::SHR64m1 },
+ { X86::SHR64rCL, X86::SHR64mCL },
+ { X86::SHR64ri, X86::SHR64mi },
+ { X86::SHR8r1, X86::SHR8m1 },
+ { X86::SHR8rCL, X86::SHR8mCL },
+ { X86::SHR8ri, X86::SHR8mi },
+ { X86::SHRD16rrCL, X86::SHRD16mrCL },
+ { X86::SHRD16rri8, X86::SHRD16mri8 },
+ { X86::SHRD32rrCL, X86::SHRD32mrCL },
+ { X86::SHRD32rri8, X86::SHRD32mri8 },
+ { X86::SHRD64rrCL, X86::SHRD64mrCL },
+ { X86::SHRD64rri8, X86::SHRD64mri8 },
+ { X86::SUB16ri, X86::SUB16mi },
+ { X86::SUB16ri8, X86::SUB16mi8 },
+ { X86::SUB16rr, X86::SUB16mr },
+ { X86::SUB32ri, X86::SUB32mi },
+ { X86::SUB32ri8, X86::SUB32mi8 },
+ { X86::SUB32rr, X86::SUB32mr },
+ { X86::SUB64ri32, X86::SUB64mi32 },
+ { X86::SUB64ri8, X86::SUB64mi8 },
+ { X86::SUB64rr, X86::SUB64mr },
+ { X86::SUB8ri, X86::SUB8mi },
+ { X86::SUB8rr, X86::SUB8mr },
+ { X86::XOR16ri, X86::XOR16mi },
+ { X86::XOR16ri8, X86::XOR16mi8 },
+ { X86::XOR16rr, X86::XOR16mr },
+ { X86::XOR32ri, X86::XOR32mi },
+ { X86::XOR32ri8, X86::XOR32mi8 },
+ { X86::XOR32rr, X86::XOR32mr },
+ { X86::XOR64ri32, X86::XOR64mi32 },
+ { X86::XOR64ri8, X86::XOR64mi8 },
+ { X86::XOR64rr, X86::XOR64mr },
+ { X86::XOR8ri, X86::XOR8mi },
+ { X86::XOR8rr, X86::XOR8mr }
+ };
+
+ for (unsigned i = 0, e = array_lengthof(OpTbl2Addr); i != e; ++i) {
+ unsigned RegOp = OpTbl2Addr[i][0];
+ unsigned MemOp = OpTbl2Addr[i][1];
+ if (!RegOp2MemOpTable2Addr.insert(std::make_pair((unsigned*)RegOp,
+ std::make_pair(MemOp,0))).second)
+ assert(false && "Duplicated entries?");
+ // Index 0, folded load and store, no alignment requirement.
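+ // AuxInfo packs the table index (0 here) in its low four bits, a
+ // "folded load" flag in bit 4, and a "folded store" flag in bit 5.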
+ unsigned AuxInfo = 0 | (1 << 4) | (1 << 5);
+ if (!MemOp2RegOpTable.insert(std::make_pair((unsigned*)MemOp,
+ std::make_pair(RegOp,
+ AuxInfo))).second)
+ AmbEntries.push_back(MemOp);
+ }
+
+ // The third value is 1 when the instruction folds a load and 0 when it folds
+ // a store; the fourth value is the required memory-operand alignment.
+ static const unsigned OpTbl0[][4] = {
+ { X86::BT16ri8, X86::BT16mi8, 1, 0 },
+ { X86::BT32ri8, X86::BT32mi8, 1, 0 },
+ { X86::BT64ri8, X86::BT64mi8, 1, 0 },
+ { X86::CALL32r, X86::CALL32m, 1, 0 },
+ { X86::CALL64r, X86::CALL64m, 1, 0 },
+ { X86::CMP16ri, X86::CMP16mi, 1, 0 },
+ { X86::CMP16ri8, X86::CMP16mi8, 1, 0 },
+ { X86::CMP16rr, X86::CMP16mr, 1, 0 },
+ { X86::CMP32ri, X86::CMP32mi, 1, 0 },
+ { X86::CMP32ri8, X86::CMP32mi8, 1, 0 },
+ { X86::CMP32rr, X86::CMP32mr, 1, 0 },
+ { X86::CMP64ri32, X86::CMP64mi32, 1, 0 },
+ { X86::CMP64ri8, X86::CMP64mi8, 1, 0 },
+ { X86::CMP64rr, X86::CMP64mr, 1, 0 },
+ { X86::CMP8ri, X86::CMP8mi, 1, 0 },
+ { X86::CMP8rr, X86::CMP8mr, 1, 0 },
+ { X86::DIV16r, X86::DIV16m, 1, 0 },
+ { X86::DIV32r, X86::DIV32m, 1, 0 },
+ { X86::DIV64r, X86::DIV64m, 1, 0 },
+ { X86::DIV8r, X86::DIV8m, 1, 0 },
+ { X86::EXTRACTPSrr, X86::EXTRACTPSmr, 0, 16 },
+ { X86::FsMOVAPDrr, X86::MOVSDmr, 0, 0 },
+ { X86::FsMOVAPSrr, X86::MOVSSmr, 0, 0 },
+ { X86::IDIV16r, X86::IDIV16m, 1, 0 },
+ { X86::IDIV32r, X86::IDIV32m, 1, 0 },
+ { X86::IDIV64r, X86::IDIV64m, 1, 0 },
+ { X86::IDIV8r, X86::IDIV8m, 1, 0 },
+ { X86::IMUL16r, X86::IMUL16m, 1, 0 },
+ { X86::IMUL32r, X86::IMUL32m, 1, 0 },
+ { X86::IMUL64r, X86::IMUL64m, 1, 0 },
+ { X86::IMUL8r, X86::IMUL8m, 1, 0 },
+ { X86::JMP32r, X86::JMP32m, 1, 0 },
+ { X86::JMP64r, X86::JMP64m, 1, 0 },
+ { X86::MOV16ri, X86::MOV16mi, 0, 0 },
+ { X86::MOV16rr, X86::MOV16mr, 0, 0 },
+ { X86::MOV32ri, X86::MOV32mi, 0, 0 },
+ { X86::MOV32rr, X86::MOV32mr, 0, 0 },
+ { X86::MOV32rr_TC, X86::MOV32mr_TC, 0, 0 },
+ { X86::MOV64ri32, X86::MOV64mi32, 0, 0 },
+ { X86::MOV64rr, X86::MOV64mr, 0, 0 },
+ { X86::MOV8ri, X86::MOV8mi, 0, 0 },
+ { X86::MOV8rr, X86::MOV8mr, 0, 0 },
+ { X86::MOV8rr_NOREX, X86::MOV8mr_NOREX, 0, 0 },
+ { X86::MOVAPDrr, X86::MOVAPDmr, 0, 16 },
+ { X86::MOVAPSrr, X86::MOVAPSmr, 0, 16 },
+ { X86::MOVDQArr, X86::MOVDQAmr, 0, 16 },
+ { X86::MOVPDI2DIrr, X86::MOVPDI2DImr, 0, 0 },
+ { X86::MOVPQIto64rr,X86::MOVPQI2QImr, 0, 0 },
+ { X86::MOVSDto64rr, X86::MOVSDto64mr, 0, 0 },
+ { X86::MOVSS2DIrr, X86::MOVSS2DImr, 0, 0 },
+ { X86::MOVUPDrr, X86::MOVUPDmr, 0, 0 },
+ { X86::MOVUPSrr, X86::MOVUPSmr, 0, 0 },
+ { X86::MUL16r, X86::MUL16m, 1, 0 },
+ { X86::MUL32r, X86::MUL32m, 1, 0 },
+ { X86::MUL64r, X86::MUL64m, 1, 0 },
+ { X86::MUL8r, X86::MUL8m, 1, 0 },
+ { X86::SETAEr, X86::SETAEm, 0, 0 },
+ { X86::SETAr, X86::SETAm, 0, 0 },
+ { X86::SETBEr, X86::SETBEm, 0, 0 },
+ { X86::SETBr, X86::SETBm, 0, 0 },
+ { X86::SETEr, X86::SETEm, 0, 0 },
+ { X86::SETGEr, X86::SETGEm, 0, 0 },
+ { X86::SETGr, X86::SETGm, 0, 0 },
+ { X86::SETLEr, X86::SETLEm, 0, 0 },
+ { X86::SETLr, X86::SETLm, 0, 0 },
+ { X86::SETNEr, X86::SETNEm, 0, 0 },
+ { X86::SETNOr, X86::SETNOm, 0, 0 },
+ { X86::SETNPr, X86::SETNPm, 0, 0 },
+ { X86::SETNSr, X86::SETNSm, 0, 0 },
+ { X86::SETOr, X86::SETOm, 0, 0 },
+ { X86::SETPr, X86::SETPm, 0, 0 },
+ { X86::SETSr, X86::SETSm, 0, 0 },
+ { X86::TAILJMPr, X86::TAILJMPm, 1, 0 },
+ { X86::TAILJMPr64, X86::TAILJMPm64, 1, 0 },
+ { X86::TEST16ri, X86::TEST16mi, 1, 0 },
+ { X86::TEST32ri, X86::TEST32mi, 1, 0 },
+ { X86::TEST64ri32, X86::TEST64mi32, 1, 0 },
+ { X86::TEST8ri, X86::TEST8mi, 1, 0 }
+ };
+
+ for (unsigned i = 0, e = array_lengthof(OpTbl0); i != e; ++i) {
+ unsigned RegOp = OpTbl0[i][0];
+ unsigned MemOp = OpTbl0[i][1];
+ unsigned Align = OpTbl0[i][3];
+ if (!RegOp2MemOpTable0.insert(std::make_pair((unsigned*)RegOp,
+ std::make_pair(MemOp,Align))).second)
+ assert(false && "Duplicated entries?");
+ unsigned FoldedLoad = OpTbl0[i][2];
+ // Index 0, folded load or store.
+ unsigned AuxInfo = 0 | (FoldedLoad << 4) | ((FoldedLoad^1) << 5);
+ if (RegOp != X86::FsMOVAPDrr && RegOp != X86::FsMOVAPSrr)
+ if (!MemOp2RegOpTable.insert(std::make_pair((unsigned*)MemOp,
+ std::make_pair(RegOp, AuxInfo))).second)
+ AmbEntries.push_back(MemOp);
+ }
+
+ static const unsigned OpTbl1[][3] = {
+ { X86::CMP16rr, X86::CMP16rm, 0 },
+ { X86::CMP32rr, X86::CMP32rm, 0 },
+ { X86::CMP64rr, X86::CMP64rm, 0 },
+ { X86::CMP8rr, X86::CMP8rm, 0 },
+ { X86::CVTSD2SSrr, X86::CVTSD2SSrm, 0 },
+ { X86::CVTSI2SD64rr, X86::CVTSI2SD64rm, 0 },
+ { X86::CVTSI2SDrr, X86::CVTSI2SDrm, 0 },
+ { X86::CVTSI2SS64rr, X86::CVTSI2SS64rm, 0 },
+ { X86::CVTSI2SSrr, X86::CVTSI2SSrm, 0 },
+ { X86::CVTSS2SDrr, X86::CVTSS2SDrm, 0 },
+ { X86::CVTTSD2SI64rr, X86::CVTTSD2SI64rm, 0 },
+ { X86::CVTTSD2SIrr, X86::CVTTSD2SIrm, 0 },
+ { X86::CVTTSS2SI64rr, X86::CVTTSS2SI64rm, 0 },
+ { X86::CVTTSS2SIrr, X86::CVTTSS2SIrm, 0 },
+ { X86::FsMOVAPDrr, X86::MOVSDrm, 0 },
+ { X86::FsMOVAPSrr, X86::MOVSSrm, 0 },
+ { X86::IMUL16rri, X86::IMUL16rmi, 0 },
+ { X86::IMUL16rri8, X86::IMUL16rmi8, 0 },
+ { X86::IMUL32rri, X86::IMUL32rmi, 0 },
+ { X86::IMUL32rri8, X86::IMUL32rmi8, 0 },
+ { X86::IMUL64rri32, X86::IMUL64rmi32, 0 },
+ { X86::IMUL64rri8, X86::IMUL64rmi8, 0 },
+ { X86::Int_CMPSDrr, X86::Int_CMPSDrm, 0 },
+ { X86::Int_CMPSSrr, X86::Int_CMPSSrm, 0 },
+ { X86::Int_COMISDrr, X86::Int_COMISDrm, 0 },
+ { X86::Int_COMISSrr, X86::Int_COMISSrm, 0 },
+ { X86::Int_CVTDQ2PDrr, X86::Int_CVTDQ2PDrm, 16 },
+ { X86::Int_CVTDQ2PSrr, X86::Int_CVTDQ2PSrm, 16 },
+ { X86::Int_CVTPD2DQrr, X86::Int_CVTPD2DQrm, 16 },
+ { X86::Int_CVTPD2PSrr, X86::Int_CVTPD2PSrm, 16 },
+ { X86::Int_CVTPS2DQrr, X86::Int_CVTPS2DQrm, 16 },
+ { X86::Int_CVTPS2PDrr, X86::Int_CVTPS2PDrm, 0 },
+ { X86::Int_CVTSD2SI64rr,X86::Int_CVTSD2SI64rm, 0 },
+ { X86::Int_CVTSD2SIrr, X86::Int_CVTSD2SIrm, 0 },
+ { X86::Int_CVTSD2SSrr, X86::Int_CVTSD2SSrm, 0 },
+ { X86::Int_CVTSI2SD64rr,X86::Int_CVTSI2SD64rm, 0 },
+ { X86::Int_CVTSI2SDrr, X86::Int_CVTSI2SDrm, 0 },
+ { X86::Int_CVTSI2SS64rr,X86::Int_CVTSI2SS64rm, 0 },
+ { X86::Int_CVTSI2SSrr, X86::Int_CVTSI2SSrm, 0 },
+ { X86::Int_CVTSS2SDrr, X86::Int_CVTSS2SDrm, 0 },
+ { X86::Int_CVTSS2SI64rr,X86::Int_CVTSS2SI64rm, 0 },
+ { X86::Int_CVTSS2SIrr, X86::Int_CVTSS2SIrm, 0 },
+ { X86::Int_CVTTPD2DQrr, X86::Int_CVTTPD2DQrm, 16 },
+ { X86::Int_CVTTPS2DQrr, X86::Int_CVTTPS2DQrm, 16 },
+ { X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm, 0 },
+ { X86::Int_CVTTSD2SIrr, X86::Int_CVTTSD2SIrm, 0 },
+ { X86::Int_CVTTSS2SI64rr,X86::Int_CVTTSS2SI64rm, 0 },
+ { X86::Int_CVTTSS2SIrr, X86::Int_CVTTSS2SIrm, 0 },
+ { X86::Int_UCOMISDrr, X86::Int_UCOMISDrm, 0 },
+ { X86::Int_UCOMISSrr, X86::Int_UCOMISSrm, 0 },
+ { X86::MOV16rr, X86::MOV16rm, 0 },
+ { X86::MOV32rr, X86::MOV32rm, 0 },
+ { X86::MOV32rr_TC, X86::MOV32rm_TC, 0 },
+ { X86::MOV64rr, X86::MOV64rm, 0 },
+ { X86::MOV64toPQIrr, X86::MOVQI2PQIrm, 0 },
+ { X86::MOV64toSDrr, X86::MOV64toSDrm, 0 },
+ { X86::MOV8rr, X86::MOV8rm, 0 },
+ { X86::MOVAPDrr, X86::MOVAPDrm, 16 },
+ { X86::MOVAPSrr, X86::MOVAPSrm, 16 },
+ { X86::MOVDDUPrr, X86::MOVDDUPrm, 0 },
+ { X86::MOVDI2PDIrr, X86::MOVDI2PDIrm, 0 },
+ { X86::MOVDI2SSrr, X86::MOVDI2SSrm, 0 },
+ { X86::MOVDQArr, X86::MOVDQArm, 16 },
+ { X86::MOVSHDUPrr, X86::MOVSHDUPrm, 16 },
+ { X86::MOVSLDUPrr, X86::MOVSLDUPrm, 16 },
+ { X86::MOVSX16rr8, X86::MOVSX16rm8, 0 },
+ { X86::MOVSX32rr16, X86::MOVSX32rm16, 0 },
+ { X86::MOVSX32rr8, X86::MOVSX32rm8, 0 },
+ { X86::MOVSX64rr16, X86::MOVSX64rm16, 0 },
+ { X86::MOVSX64rr32, X86::MOVSX64rm32, 0 },
+ { X86::MOVSX64rr8, X86::MOVSX64rm8, 0 },
+ { X86::MOVUPDrr, X86::MOVUPDrm, 16 },
+ { X86::MOVUPSrr, X86::MOVUPSrm, 0 },
+ { X86::MOVZDI2PDIrr, X86::MOVZDI2PDIrm, 0 },
+ { X86::MOVZQI2PQIrr, X86::MOVZQI2PQIrm, 0 },
+ { X86::MOVZPQILo2PQIrr, X86::MOVZPQILo2PQIrm, 16 },
+ { X86::MOVZX16rr8, X86::MOVZX16rm8, 0 },
+ { X86::MOVZX32rr16, X86::MOVZX32rm16, 0 },
+ { X86::MOVZX32_NOREXrr8, X86::MOVZX32_NOREXrm8, 0 },
+ { X86::MOVZX32rr8, X86::MOVZX32rm8, 0 },
+ { X86::MOVZX64rr16, X86::MOVZX64rm16, 0 },
+ { X86::MOVZX64rr32, X86::MOVZX64rm32, 0 },
+ { X86::MOVZX64rr8, X86::MOVZX64rm8, 0 },
+ { X86::PSHUFDri, X86::PSHUFDmi, 16 },
+ { X86::PSHUFHWri, X86::PSHUFHWmi, 16 },
+ { X86::PSHUFLWri, X86::PSHUFLWmi, 16 },
+ { X86::RCPPSr, X86::RCPPSm, 16 },
+ { X86::RCPPSr_Int, X86::RCPPSm_Int, 16 },
+ { X86::RSQRTPSr, X86::RSQRTPSm, 16 },
+ { X86::RSQRTPSr_Int, X86::RSQRTPSm_Int, 16 },
+ { X86::RSQRTSSr, X86::RSQRTSSm, 0 },
+ { X86::RSQRTSSr_Int, X86::RSQRTSSm_Int, 0 },
+ { X86::SQRTPDr, X86::SQRTPDm, 16 },
+ { X86::SQRTPDr_Int, X86::SQRTPDm_Int, 16 },
+ { X86::SQRTPSr, X86::SQRTPSm, 16 },
+ { X86::SQRTPSr_Int, X86::SQRTPSm_Int, 16 },
+ { X86::SQRTSDr, X86::SQRTSDm, 0 },
+ { X86::SQRTSDr_Int, X86::SQRTSDm_Int, 0 },
+ { X86::SQRTSSr, X86::SQRTSSm, 0 },
+ { X86::SQRTSSr_Int, X86::SQRTSSm_Int, 0 },
+ { X86::TEST16rr, X86::TEST16rm, 0 },
+ { X86::TEST32rr, X86::TEST32rm, 0 },
+ { X86::TEST64rr, X86::TEST64rm, 0 },
+ { X86::TEST8rr, X86::TEST8rm, 0 },
+ // FIXME: TEST*rr EAX,EAX ---> CMP [mem], 0
+ { X86::UCOMISDrr, X86::UCOMISDrm, 0 },
+ { X86::UCOMISSrr, X86::UCOMISSrm, 0 }
+ };
+
+ for (unsigned i = 0, e = array_lengthof(OpTbl1); i != e; ++i) {
+ unsigned RegOp = OpTbl1[i][0];
+ unsigned MemOp = OpTbl1[i][1];
+ unsigned Align = OpTbl1[i][2];
+ if (!RegOp2MemOpTable1.insert(std::make_pair((unsigned*)RegOp,
+ std::make_pair(MemOp,Align))).second)
+ assert(false && "Duplicated entries?");
+ // Index 1, folded load
+ unsigned AuxInfo = 1 | (1 << 4);
+ if (RegOp != X86::FsMOVAPDrr && RegOp != X86::FsMOVAPSrr)
+ if (!MemOp2RegOpTable.insert(std::make_pair((unsigned*)MemOp,
+ std::make_pair(RegOp, AuxInfo))).second)
+ AmbEntries.push_back(MemOp);
+ }
+
+ static const unsigned OpTbl2[][3] = {
+ { X86::ADC32rr, X86::ADC32rm, 0 },
+ { X86::ADC64rr, X86::ADC64rm, 0 },
+ { X86::ADD16rr, X86::ADD16rm, 0 },
+ { X86::ADD32rr, X86::ADD32rm, 0 },
+ { X86::ADD64rr, X86::ADD64rm, 0 },
+ { X86::ADD8rr, X86::ADD8rm, 0 },
+ { X86::ADDPDrr, X86::ADDPDrm, 16 },
+ { X86::ADDPSrr, X86::ADDPSrm, 16 },
+ { X86::ADDSDrr, X86::ADDSDrm, 0 },
+ { X86::ADDSSrr, X86::ADDSSrm, 0 },
+ { X86::ADDSUBPDrr, X86::ADDSUBPDrm, 16 },
+ { X86::ADDSUBPSrr, X86::ADDSUBPSrm, 16 },
+ { X86::AND16rr, X86::AND16rm, 0 },
+ { X86::AND32rr, X86::AND32rm, 0 },
+ { X86::AND64rr, X86::AND64rm, 0 },
+ { X86::AND8rr, X86::AND8rm, 0 },
+ { X86::ANDNPDrr, X86::ANDNPDrm, 16 },
+ { X86::ANDNPSrr, X86::ANDNPSrm, 16 },
+ { X86::ANDPDrr, X86::ANDPDrm, 16 },
+ { X86::ANDPSrr, X86::ANDPSrm, 16 },
+ { X86::CMOVA16rr, X86::CMOVA16rm, 0 },
+ { X86::CMOVA32rr, X86::CMOVA32rm, 0 },
+ { X86::CMOVA64rr, X86::CMOVA64rm, 0 },
+ { X86::CMOVAE16rr, X86::CMOVAE16rm, 0 },
+ { X86::CMOVAE32rr, X86::CMOVAE32rm, 0 },
+ { X86::CMOVAE64rr, X86::CMOVAE64rm, 0 },
+ { X86::CMOVB16rr, X86::CMOVB16rm, 0 },
+ { X86::CMOVB32rr, X86::CMOVB32rm, 0 },
+ { X86::CMOVB64rr, X86::CMOVB64rm, 0 },
+ { X86::CMOVBE16rr, X86::CMOVBE16rm, 0 },
+ { X86::CMOVBE32rr, X86::CMOVBE32rm, 0 },
+ { X86::CMOVBE64rr, X86::CMOVBE64rm, 0 },
+ { X86::CMOVE16rr, X86::CMOVE16rm, 0 },
+ { X86::CMOVE32rr, X86::CMOVE32rm, 0 },
+ { X86::CMOVE64rr, X86::CMOVE64rm, 0 },
+ { X86::CMOVG16rr, X86::CMOVG16rm, 0 },
+ { X86::CMOVG32rr, X86::CMOVG32rm, 0 },
+ { X86::CMOVG64rr, X86::CMOVG64rm, 0 },
+ { X86::CMOVGE16rr, X86::CMOVGE16rm, 0 },
+ { X86::CMOVGE32rr, X86::CMOVGE32rm, 0 },
+ { X86::CMOVGE64rr, X86::CMOVGE64rm, 0 },
+ { X86::CMOVL16rr, X86::CMOVL16rm, 0 },
+ { X86::CMOVL32rr, X86::CMOVL32rm, 0 },
+ { X86::CMOVL64rr, X86::CMOVL64rm, 0 },
+ { X86::CMOVLE16rr, X86::CMOVLE16rm, 0 },
+ { X86::CMOVLE32rr, X86::CMOVLE32rm, 0 },
+ { X86::CMOVLE64rr, X86::CMOVLE64rm, 0 },
+ { X86::CMOVNE16rr, X86::CMOVNE16rm, 0 },
+ { X86::CMOVNE32rr, X86::CMOVNE32rm, 0 },
+ { X86::CMOVNE64rr, X86::CMOVNE64rm, 0 },
+ { X86::CMOVNO16rr, X86::CMOVNO16rm, 0 },
+ { X86::CMOVNO32rr, X86::CMOVNO32rm, 0 },
+ { X86::CMOVNO64rr, X86::CMOVNO64rm, 0 },
+ { X86::CMOVNP16rr, X86::CMOVNP16rm, 0 },
+ { X86::CMOVNP32rr, X86::CMOVNP32rm, 0 },
+ { X86::CMOVNP64rr, X86::CMOVNP64rm, 0 },
+ { X86::CMOVNS16rr, X86::CMOVNS16rm, 0 },
+ { X86::CMOVNS32rr, X86::CMOVNS32rm, 0 },
+ { X86::CMOVNS64rr, X86::CMOVNS64rm, 0 },
+ { X86::CMOVO16rr, X86::CMOVO16rm, 0 },
+ { X86::CMOVO32rr, X86::CMOVO32rm, 0 },
+ { X86::CMOVO64rr, X86::CMOVO64rm, 0 },
+ { X86::CMOVP16rr, X86::CMOVP16rm, 0 },
+ { X86::CMOVP32rr, X86::CMOVP32rm, 0 },
+ { X86::CMOVP64rr, X86::CMOVP64rm, 0 },
+ { X86::CMOVS16rr, X86::CMOVS16rm, 0 },
+ { X86::CMOVS32rr, X86::CMOVS32rm, 0 },
+ { X86::CMOVS64rr, X86::CMOVS64rm, 0 },
+ { X86::CMPPDrri, X86::CMPPDrmi, 16 },
+ { X86::CMPPSrri, X86::CMPPSrmi, 16 },
+ { X86::CMPSDrr, X86::CMPSDrm, 0 },
+ { X86::CMPSSrr, X86::CMPSSrm, 0 },
+ { X86::DIVPDrr, X86::DIVPDrm, 16 },
+ { X86::DIVPSrr, X86::DIVPSrm, 16 },
+ { X86::DIVSDrr, X86::DIVSDrm, 0 },
+ { X86::DIVSSrr, X86::DIVSSrm, 0 },
+ { X86::FsANDNPDrr, X86::FsANDNPDrm, 16 },
+ { X86::FsANDNPSrr, X86::FsANDNPSrm, 16 },
+ { X86::FsANDPDrr, X86::FsANDPDrm, 16 },
+ { X86::FsANDPSrr, X86::FsANDPSrm, 16 },
+ { X86::FsORPDrr, X86::FsORPDrm, 16 },
+ { X86::FsORPSrr, X86::FsORPSrm, 16 },
+ { X86::FsXORPDrr, X86::FsXORPDrm, 16 },
+ { X86::FsXORPSrr, X86::FsXORPSrm, 16 },
+ { X86::HADDPDrr, X86::HADDPDrm, 16 },
+ { X86::HADDPSrr, X86::HADDPSrm, 16 },
+ { X86::HSUBPDrr, X86::HSUBPDrm, 16 },
+ { X86::HSUBPSrr, X86::HSUBPSrm, 16 },
+ { X86::IMUL16rr, X86::IMUL16rm, 0 },
+ { X86::IMUL32rr, X86::IMUL32rm, 0 },
+ { X86::IMUL64rr, X86::IMUL64rm, 0 },
+ { X86::MAXPDrr, X86::MAXPDrm, 16 },
+ { X86::MAXPDrr_Int, X86::MAXPDrm_Int, 16 },
+ { X86::MAXPSrr, X86::MAXPSrm, 16 },
+ { X86::MAXPSrr_Int, X86::MAXPSrm_Int, 16 },
+ { X86::MAXSDrr, X86::MAXSDrm, 0 },
+ { X86::MAXSDrr_Int, X86::MAXSDrm_Int, 0 },
+ { X86::MAXSSrr, X86::MAXSSrm, 0 },
+ { X86::MAXSSrr_Int, X86::MAXSSrm_Int, 0 },
+ { X86::MINPDrr, X86::MINPDrm, 16 },
+ { X86::MINPDrr_Int, X86::MINPDrm_Int, 16 },
+ { X86::MINPSrr, X86::MINPSrm, 16 },
+ { X86::MINPSrr_Int, X86::MINPSrm_Int, 16 },
+ { X86::MINSDrr, X86::MINSDrm, 0 },
+ { X86::MINSDrr_Int, X86::MINSDrm_Int, 0 },
+ { X86::MINSSrr, X86::MINSSrm, 0 },
+ { X86::MINSSrr_Int, X86::MINSSrm_Int, 0 },
+ { X86::MULPDrr, X86::MULPDrm, 16 },
+ { X86::MULPSrr, X86::MULPSrm, 16 },
+ { X86::MULSDrr, X86::MULSDrm, 0 },
+ { X86::MULSSrr, X86::MULSSrm, 0 },
+ { X86::OR16rr, X86::OR16rm, 0 },
+ { X86::OR32rr, X86::OR32rm, 0 },
+ { X86::OR64rr, X86::OR64rm, 0 },
+ { X86::OR8rr, X86::OR8rm, 0 },
+ { X86::ORPDrr, X86::ORPDrm, 16 },
+ { X86::ORPSrr, X86::ORPSrm, 16 },
+ { X86::PACKSSDWrr, X86::PACKSSDWrm, 16 },
+ { X86::PACKSSWBrr, X86::PACKSSWBrm, 16 },
+ { X86::PACKUSWBrr, X86::PACKUSWBrm, 16 },
+ { X86::PADDBrr, X86::PADDBrm, 16 },
+ { X86::PADDDrr, X86::PADDDrm, 16 },
+ { X86::PADDQrr, X86::PADDQrm, 16 },
+ { X86::PADDSBrr, X86::PADDSBrm, 16 },
+ { X86::PADDSWrr, X86::PADDSWrm, 16 },
+ { X86::PADDWrr, X86::PADDWrm, 16 },
+ { X86::PANDNrr, X86::PANDNrm, 16 },
+ { X86::PANDrr, X86::PANDrm, 16 },
+ { X86::PAVGBrr, X86::PAVGBrm, 16 },
+ { X86::PAVGWrr, X86::PAVGWrm, 16 },
+ { X86::PCMPEQBrr, X86::PCMPEQBrm, 16 },
+ { X86::PCMPEQDrr, X86::PCMPEQDrm, 16 },
+ { X86::PCMPEQWrr, X86::PCMPEQWrm, 16 },
+ { X86::PCMPGTBrr, X86::PCMPGTBrm, 16 },
+ { X86::PCMPGTDrr, X86::PCMPGTDrm, 16 },
+ { X86::PCMPGTWrr, X86::PCMPGTWrm, 16 },
+ { X86::PINSRWrri, X86::PINSRWrmi, 16 },
+ { X86::PMADDWDrr, X86::PMADDWDrm, 16 },
+ { X86::PMAXSWrr, X86::PMAXSWrm, 16 },
+ { X86::PMAXUBrr, X86::PMAXUBrm, 16 },
+ { X86::PMINSWrr, X86::PMINSWrm, 16 },
+ { X86::PMINUBrr, X86::PMINUBrm, 16 },
+ { X86::PMULDQrr, X86::PMULDQrm, 16 },
+ { X86::PMULHUWrr, X86::PMULHUWrm, 16 },
+ { X86::PMULHWrr, X86::PMULHWrm, 16 },
+ { X86::PMULLDrr, X86::PMULLDrm, 16 },
+ { X86::PMULLWrr, X86::PMULLWrm, 16 },
+ { X86::PMULUDQrr, X86::PMULUDQrm, 16 },
+ { X86::PORrr, X86::PORrm, 16 },
+ { X86::PSADBWrr, X86::PSADBWrm, 16 },
+ { X86::PSLLDrr, X86::PSLLDrm, 16 },
+ { X86::PSLLQrr, X86::PSLLQrm, 16 },
+ { X86::PSLLWrr, X86::PSLLWrm, 16 },
+ { X86::PSRADrr, X86::PSRADrm, 16 },
+ { X86::PSRAWrr, X86::PSRAWrm, 16 },
+ { X86::PSRLDrr, X86::PSRLDrm, 16 },
+ { X86::PSRLQrr, X86::PSRLQrm, 16 },
+ { X86::PSRLWrr, X86::PSRLWrm, 16 },
+ { X86::PSUBBrr, X86::PSUBBrm, 16 },
+ { X86::PSUBDrr, X86::PSUBDrm, 16 },
+ { X86::PSUBSBrr, X86::PSUBSBrm, 16 },
+ { X86::PSUBSWrr, X86::PSUBSWrm, 16 },
+ { X86::PSUBWrr, X86::PSUBWrm, 16 },
+ { X86::PUNPCKHBWrr, X86::PUNPCKHBWrm, 16 },
+ { X86::PUNPCKHDQrr, X86::PUNPCKHDQrm, 16 },
+ { X86::PUNPCKHQDQrr, X86::PUNPCKHQDQrm, 16 },
+ { X86::PUNPCKHWDrr, X86::PUNPCKHWDrm, 16 },
+ { X86::PUNPCKLBWrr, X86::PUNPCKLBWrm, 16 },
+ { X86::PUNPCKLDQrr, X86::PUNPCKLDQrm, 16 },
+ { X86::PUNPCKLQDQrr, X86::PUNPCKLQDQrm, 16 },
+ { X86::PUNPCKLWDrr, X86::PUNPCKLWDrm, 16 },
+ { X86::PXORrr, X86::PXORrm, 16 },
+ { X86::SBB32rr, X86::SBB32rm, 0 },
+ { X86::SBB64rr, X86::SBB64rm, 0 },
+ { X86::SHUFPDrri, X86::SHUFPDrmi, 16 },
+ { X86::SHUFPSrri, X86::SHUFPSrmi, 16 },
+ { X86::SUB16rr, X86::SUB16rm, 0 },
+ { X86::SUB32rr, X86::SUB32rm, 0 },
+ { X86::SUB64rr, X86::SUB64rm, 0 },
+ { X86::SUB8rr, X86::SUB8rm, 0 },
+ { X86::SUBPDrr, X86::SUBPDrm, 16 },
+ { X86::SUBPSrr, X86::SUBPSrm, 16 },
+ { X86::SUBSDrr, X86::SUBSDrm, 0 },
+ { X86::SUBSSrr, X86::SUBSSrm, 0 },
+ // FIXME: TEST*rr -> swapped operand of TEST*mr.
+ { X86::UNPCKHPDrr, X86::UNPCKHPDrm, 16 },
+ { X86::UNPCKHPSrr, X86::UNPCKHPSrm, 16 },
+ { X86::UNPCKLPDrr, X86::UNPCKLPDrm, 16 },
+ { X86::UNPCKLPSrr, X86::UNPCKLPSrm, 16 },
+ { X86::XOR16rr, X86::XOR16rm, 0 },
+ { X86::XOR32rr, X86::XOR32rm, 0 },
+ { X86::XOR64rr, X86::XOR64rm, 0 },
+ { X86::XOR8rr, X86::XOR8rm, 0 },
+ { X86::XORPDrr, X86::XORPDrm, 16 },
+ { X86::XORPSrr, X86::XORPSrm, 16 }
+ };
+
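+  // Each OpTbl2 entry is { register-form opcode, memory-form opcode, minimum
+  // alignment required on the folded memory operand } -- 16 for SSE packed
+  // operations, 0 for no alignment constraint. Each pair is registered in
+  // both the folding (reg -> mem) and unfolding (mem -> reg) maps below.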
+ for (unsigned i = 0, e = array_lengthof(OpTbl2); i != e; ++i) {
+ unsigned RegOp = OpTbl2[i][0];
+ unsigned MemOp = OpTbl2[i][1];
+ unsigned Align = OpTbl2[i][2];
+ if (!RegOp2MemOpTable2.insert(std::make_pair((unsigned*)RegOp,
+ std::make_pair(MemOp,Align))).second)
+ assert(false && "Duplicated entries?");
+ // Index 2, folded load
+ unsigned AuxInfo = 2 | (1 << 4);
+ if (!MemOp2RegOpTable.insert(std::make_pair((unsigned*)MemOp,
+ std::make_pair(RegOp, AuxInfo))).second)
+ AmbEntries.push_back(MemOp);
+ }
+
+  // There should be no ambiguous entries left in the unfolding maps.
+ assert(AmbEntries.empty() && "Duplicated entries in unfolding maps?");
+}
+
+bool X86InstrInfo::isMoveInstr(const MachineInstr& MI,
+ unsigned &SrcReg, unsigned &DstReg,
+ unsigned &SrcSubIdx, unsigned &DstSubIdx) const {
+ switch (MI.getOpcode()) {
+ default:
+ return false;
+ case X86::MOV8rr:
+ case X86::MOV8rr_NOREX:
+ case X86::MOV16rr:
+ case X86::MOV32rr:
+ case X86::MOV64rr:
+ case X86::MOV32rr_TC:
+ case X86::MOV64rr_TC:
+
+ // FP Stack register class copies
+ case X86::MOV_Fp3232: case X86::MOV_Fp6464: case X86::MOV_Fp8080:
+ case X86::MOV_Fp3264: case X86::MOV_Fp3280:
+ case X86::MOV_Fp6432: case X86::MOV_Fp8032:
+
+ // Note that MOVSSrr and MOVSDrr are not considered copies. FR32 and FR64
+ // copies are done with FsMOVAPSrr and FsMOVAPDrr.
+
+ case X86::FsMOVAPSrr:
+ case X86::FsMOVAPDrr:
+ case X86::MOVAPSrr:
+ case X86::MOVAPDrr:
+ case X86::MOVDQArr:
+ case X86::MMX_MOVQ64rr:
+ assert(MI.getNumOperands() >= 2 &&
+ MI.getOperand(0).isReg() &&
+ MI.getOperand(1).isReg() &&
+ "invalid register-register move instruction");
+ SrcReg = MI.getOperand(1).getReg();
+ DstReg = MI.getOperand(0).getReg();
+ SrcSubIdx = MI.getOperand(1).getSubReg();
+ DstSubIdx = MI.getOperand(0).getSubReg();
+ return true;
+ }
+}
+
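+/// isCoalescableExtInstr - Return true if MI is a sign- or zero-extension
+/// that the register coalescer can treat as a copy, returning the source and
+/// destination registers and the sub-register index of the destination that
+/// holds the pre-extension value.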
+bool
+X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
+ unsigned &SrcReg, unsigned &DstReg,
+ unsigned &SubIdx) const {
+ switch (MI.getOpcode()) {
+ default: break;
+ case X86::MOVSX16rr8:
+ case X86::MOVZX16rr8:
+ case X86::MOVSX32rr8:
+ case X86::MOVZX32rr8:
+ case X86::MOVSX64rr8:
+ case X86::MOVZX64rr8:
+ if (!TM.getSubtarget<X86Subtarget>().is64Bit())
+      // It's not always legal to reference the low 8 bits of the larger
+      // register in 32-bit mode.
+ return false;
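+    // In 64-bit mode, fall through to the common handling below.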
+ case X86::MOVSX32rr16:
+ case X86::MOVZX32rr16:
+ case X86::MOVSX64rr16:
+ case X86::MOVZX64rr16:
+ case X86::MOVSX64rr32:
+ case X86::MOVZX64rr32: {
+ if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
+ // Be conservative.
+ return false;
+ SrcReg = MI.getOperand(1).getReg();
+ DstReg = MI.getOperand(0).getReg();
+ switch (MI.getOpcode()) {
+ default:
+ llvm_unreachable(0);
+ break;
+ case X86::MOVSX16rr8:
+ case X86::MOVZX16rr8:
+ case X86::MOVSX32rr8:
+ case X86::MOVZX32rr8:
+ case X86::MOVSX64rr8:
+ case X86::MOVZX64rr8:
+ SubIdx = X86::sub_8bit;
+ break;
+ case X86::MOVSX32rr16:
+ case X86::MOVZX32rr16:
+ case X86::MOVSX64rr16:
+ case X86::MOVZX64rr16:
+ SubIdx = X86::sub_16bit;
+ break;
+ case X86::MOVSX64rr32:
+ case X86::MOVZX64rr32:
+ SubIdx = X86::sub_32bit;
+ break;
+ }
+ return true;
+ }
+ }
+ return false;
+}
+
+/// isFrameOperand - Return true and set FrameIndex if the specified operand
+/// and the operands that follow it form a reference to the stack frame.
+bool X86InstrInfo::isFrameOperand(const MachineInstr *MI, unsigned int Op,
+ int &FrameIndex) const {
+ if (MI->getOperand(Op).isFI() && MI->getOperand(Op+1).isImm() &&
+ MI->getOperand(Op+2).isReg() && MI->getOperand(Op+3).isImm() &&
+ MI->getOperand(Op+1).getImm() == 1 &&
+ MI->getOperand(Op+2).getReg() == 0 &&
+ MI->getOperand(Op+3).getImm() == 0) {
+ FrameIndex = MI->getOperand(Op).getIndex();
+ return true;
+ }
+ return false;
+}
+
+static bool isFrameLoadOpcode(int Opcode) {
+ switch (Opcode) {
+ default: break;
+ case X86::MOV8rm:
+ case X86::MOV16rm:
+ case X86::MOV32rm:
+ case X86::MOV64rm:
+ case X86::LD_Fp64m:
+ case X86::MOVSSrm:
+ case X86::MOVSDrm:
+ case X86::MOVAPSrm:
+ case X86::MOVAPDrm:
+ case X86::MOVDQArm:
+ case X86::MMX_MOVD64rm:
+ case X86::MMX_MOVQ64rm:
+ return true;
+ }
+ return false;
+}
+
+static bool isFrameStoreOpcode(int Opcode) {
+ switch (Opcode) {
+ default: break;
+ case X86::MOV8mr:
+ case X86::MOV16mr:
+ case X86::MOV32mr:
+ case X86::MOV64mr:
+ case X86::ST_FpP64m:
+ case X86::MOVSSmr:
+ case X86::MOVSDmr:
+ case X86::MOVAPSmr:
+ case X86::MOVAPDmr:
+ case X86::MOVDQAmr:
+ case X86::MMX_MOVD64mr:
+ case X86::MMX_MOVQ64mr:
+ case X86::MMX_MOVNTQmr:
+ return true;
+ }
+ return false;
+}
+
+unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const {
+ if (isFrameLoadOpcode(MI->getOpcode()))
+ if (isFrameOperand(MI, 1, FrameIndex))
+ return MI->getOperand(0).getReg();
+ return 0;
+}
+
+unsigned X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI,
+ int &FrameIndex) const {
+ if (isFrameLoadOpcode(MI->getOpcode())) {
+ unsigned Reg;
+ if ((Reg = isLoadFromStackSlot(MI, FrameIndex)))
+ return Reg;
+ // Check for post-frame index elimination operations
+ const MachineMemOperand *Dummy;
+ return hasLoadFromStackSlot(MI, Dummy, FrameIndex);
+ }
+ return 0;
+}
+
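+/// hasLoadFromStackSlot - Scan MI's memory operands for a load from a fixed
+/// stack slot. If one is found, set MMO and FrameIndex and return true.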
+bool X86InstrInfo::hasLoadFromStackSlot(const MachineInstr *MI,
+ const MachineMemOperand *&MMO,
+ int &FrameIndex) const {
+ for (MachineInstr::mmo_iterator o = MI->memoperands_begin(),
+ oe = MI->memoperands_end();
+ o != oe;
+ ++o) {
+ if ((*o)->isLoad() && (*o)->getValue())
+ if (const FixedStackPseudoSourceValue *Value =
+ dyn_cast<const FixedStackPseudoSourceValue>((*o)->getValue())) {
+ FrameIndex = Value->getFrameIndex();
+ MMO = *o;
+ return true;
+ }
+ }
+ return false;
+}
+
+unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+ int &FrameIndex) const {
+ if (isFrameStoreOpcode(MI->getOpcode()))
+ if (isFrameOperand(MI, 0, FrameIndex))
+ return MI->getOperand(X86AddrNumOperands).getReg();
+ return 0;
+}
+
+unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr *MI,
+ int &FrameIndex) const {
+ if (isFrameStoreOpcode(MI->getOpcode())) {
+ unsigned Reg;
+ if ((Reg = isStoreToStackSlot(MI, FrameIndex)))
+ return Reg;
+ // Check for post-frame index elimination operations
+ const MachineMemOperand *Dummy;
+ return hasStoreToStackSlot(MI, Dummy, FrameIndex);
+ }
+ return 0;
+}
+
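+/// hasStoreToStackSlot - Scan MI's memory operands for a store to a fixed
+/// stack slot. If one is found, set MMO and FrameIndex and return true.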
+bool X86InstrInfo::hasStoreToStackSlot(const MachineInstr *MI,
+ const MachineMemOperand *&MMO,
+ int &FrameIndex) const {
+ for (MachineInstr::mmo_iterator o = MI->memoperands_begin(),
+ oe = MI->memoperands_end();
+ o != oe;
+ ++o) {
+ if ((*o)->isStore() && (*o)->getValue())
+ if (const FixedStackPseudoSourceValue *Value =
+ dyn_cast<const FixedStackPseudoSourceValue>((*o)->getValue())) {
+ FrameIndex = Value->getFrameIndex();
+ MMO = *o;
+ return true;
+ }
+ }
+ return false;
+}
+
+/// regIsPICBase - Return true if the register is a PIC base (i.e. defined by
+/// X86::MOVPC32r).
+static bool regIsPICBase(unsigned BaseReg, const MachineRegisterInfo &MRI) {
+ bool isPICBase = false;
+ for (MachineRegisterInfo::def_iterator I = MRI.def_begin(BaseReg),
+ E = MRI.def_end(); I != E; ++I) {
+ MachineInstr *DefMI = I.getOperand().getParent();
+ if (DefMI->getOpcode() != X86::MOVPC32r)
+ return false;
+ assert(!isPICBase && "More than one PIC base?");
+ isPICBase = true;
+ }
+ return isPICBase;
+}
+
+bool
+X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI,
+ AliasAnalysis *AA) const {
+ switch (MI->getOpcode()) {
+ default: break;
+ case X86::MOV8rm:
+ case X86::MOV16rm:
+ case X86::MOV32rm:
+ case X86::MOV64rm:
+ case X86::LD_Fp64m:
+ case X86::MOVSSrm:
+ case X86::MOVSDrm:
+ case X86::MOVAPSrm:
+ case X86::MOVUPSrm:
+ case X86::MOVUPSrm_Int:
+ case X86::MOVAPDrm:
+ case X86::MOVDQArm:
+ case X86::MMX_MOVD64rm:
+ case X86::MMX_MOVQ64rm:
+ case X86::FsMOVAPSrm:
+ case X86::FsMOVAPDrm: {
+ // Loads from constant pools are trivially rematerializable.
+ if (MI->getOperand(1).isReg() &&
+ MI->getOperand(2).isImm() &&
+ MI->getOperand(3).isReg() && MI->getOperand(3).getReg() == 0 &&
+ MI->isInvariantLoad(AA)) {
+ unsigned BaseReg = MI->getOperand(1).getReg();
+ if (BaseReg == 0 || BaseReg == X86::RIP)
+ return true;
+ // Allow re-materialization of PIC load.
+ if (!ReMatPICStubLoad && MI->getOperand(4).isGlobal())
+ return false;
+ const MachineFunction &MF = *MI->getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ bool isPICBase = false;
+ for (MachineRegisterInfo::def_iterator I = MRI.def_begin(BaseReg),
+ E = MRI.def_end(); I != E; ++I) {
+ MachineInstr *DefMI = I.getOperand().getParent();
+ if (DefMI->getOpcode() != X86::MOVPC32r)
+ return false;
+ assert(!isPICBase && "More than one PIC base?");
+ isPICBase = true;
+ }
+ return isPICBase;
+ }
+ return false;
+ }
+
+ case X86::LEA32r:
+ case X86::LEA64r: {
+ if (MI->getOperand(2).isImm() &&
+ MI->getOperand(3).isReg() && MI->getOperand(3).getReg() == 0 &&
+ !MI->getOperand(4).isReg()) {
+ // lea fi#, lea GV, etc. are all rematerializable.
+ if (!MI->getOperand(1).isReg())
+ return true;
+ unsigned BaseReg = MI->getOperand(1).getReg();
+ if (BaseReg == 0)
+ return true;
+ // Allow re-materialization of lea PICBase + x.
+ const MachineFunction &MF = *MI->getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ return regIsPICBase(BaseReg, MRI);
+ }
+ return false;
+ }
+ }
+
+ // All other instructions marked M_REMATERIALIZABLE are always trivially
+ // rematerializable.
+ return true;
+}
+
+/// isSafeToClobberEFLAGS - Return true if it's safe to insert an instruction
+/// that would clobber the EFLAGS condition register. Note the result may be
+/// conservative: if safety cannot be determined after visiting a few
+/// instructions in each direction, it is assumed to be unsafe.
+static bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) {
+ MachineBasicBlock::iterator E = MBB.end();
+
+ // It's always safe to clobber EFLAGS at the end of a block.
+ if (I == E)
+ return true;
+
+ // For compile time consideration, if we are not able to determine the
+ // safety after visiting 4 instructions in each direction, we will assume
+ // it's not safe.
+ MachineBasicBlock::iterator Iter = I;
+ for (unsigned i = 0; i < 4; ++i) {
+ bool SeenDef = false;
+ for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) {
+ MachineOperand &MO = Iter->getOperand(j);
+ if (!MO.isReg())
+ continue;
+ if (MO.getReg() == X86::EFLAGS) {
+ if (MO.isUse())
+ return false;
+ SeenDef = true;
+ }
+ }
+
+ if (SeenDef)
+ // This instruction defines EFLAGS, no need to look any further.
+ return true;
+ ++Iter;
+ // Skip over DBG_VALUE.
+ while (Iter != E && Iter->isDebugValue())
+ ++Iter;
+
+ // If we make it to the end of the block, it's safe to clobber EFLAGS.
+ if (Iter == E)
+ return true;
+ }
+
+ MachineBasicBlock::iterator B = MBB.begin();
+ Iter = I;
+ for (unsigned i = 0; i < 4; ++i) {
+ // If we make it to the beginning of the block, it's safe to clobber
+ // EFLAGS iff EFLAGS is not live-in.
+ if (Iter == B)
+ return !MBB.isLiveIn(X86::EFLAGS);
+
+ --Iter;
+ // Skip over DBG_VALUE.
+ while (Iter != B && Iter->isDebugValue())
+ --Iter;
+
+ bool SawKill = false;
+ for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) {
+ MachineOperand &MO = Iter->getOperand(j);
+ if (MO.isReg() && MO.getReg() == X86::EFLAGS) {
+ if (MO.isDef()) return MO.isDead();
+ if (MO.isKill()) SawKill = true;
+ }
+ }
+
+ if (SawKill)
+ // This instruction kills EFLAGS and doesn't redefine it, so
+ // there's no need to look further.
+ return true;
+ }
+
+ // Conservative answer.
+ return false;
+}
+
+void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned DestReg, unsigned SubIdx,
+ const MachineInstr *Orig,
+ const TargetRegisterInfo *TRI) const {
+ DebugLoc DL = Orig->getDebugLoc();
+
+ if (SubIdx && TargetRegisterInfo::isPhysicalRegister(DestReg)) {
+ DestReg = TRI->getSubReg(DestReg, SubIdx);
+ SubIdx = 0;
+ }
+
+ // MOV32r0 etc. are implemented with xor which clobbers condition code.
+ // Re-materialize them as movri instructions to avoid side effects.
+ bool Clone = true;
+ unsigned Opc = Orig->getOpcode();
+ switch (Opc) {
+ default: break;
+ case X86::MOV8r0:
+ case X86::MOV16r0:
+ case X86::MOV32r0:
+ case X86::MOV64r0: {
+ if (!isSafeToClobberEFLAGS(MBB, I)) {
+ switch (Opc) {
+ default: break;
+ case X86::MOV8r0: Opc = X86::MOV8ri; break;
+ case X86::MOV16r0: Opc = X86::MOV16ri; break;
+ case X86::MOV32r0: Opc = X86::MOV32ri; break;
+ case X86::MOV64r0: Opc = X86::MOV64ri64i32; break;
+ }
+ Clone = false;
+ }
+ break;
+ }
+ }
+
+ if (Clone) {
+ MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig);
+ MI->getOperand(0).setReg(DestReg);
+ MBB.insert(I, MI);
+ } else {
+ BuildMI(MBB, I, DL, get(Opc), DestReg).addImm(0);
+ }
+
+ MachineInstr *NewMI = prior(I);
+ NewMI->getOperand(0).setSubReg(SubIdx);
+}
+
+/// hasLiveCondCodeDef - True if MI has a condition code def, e.g. EFLAGS, that
+/// is not marked dead.
+static bool hasLiveCondCodeDef(MachineInstr *MI) {
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg() && MO.isDef() &&
+ MO.getReg() == X86::EFLAGS && !MO.isDead()) {
+ return true;
+ }
+ }
+ return false;
+}
+
+/// convertToThreeAddressWithLEA - Helper for convertToThreeAddress when
+/// 16-bit LEA is disabled; it forms 3-address code with a 32-bit LEA by
+/// promoting the operands to a 32-bit superregister and then truncating the
+/// result back down to a 16-bit subregister.
+MachineInstr *
+X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
+ MachineFunction::iterator &MFI,
+ MachineBasicBlock::iterator &MBBI,
+ LiveVariables *LV) const {
+ MachineInstr *MI = MBBI;
+ unsigned Dest = MI->getOperand(0).getReg();
+ unsigned Src = MI->getOperand(1).getReg();
+ bool isDead = MI->getOperand(0).isDead();
+ bool isKill = MI->getOperand(1).isKill();
+
+ unsigned Opc = TM.getSubtarget<X86Subtarget>().is64Bit()
+ ? X86::LEA64_32r : X86::LEA32r;
+ MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo();
+ unsigned leaInReg = RegInfo.createVirtualRegister(&X86::GR32RegClass);
+ unsigned leaOutReg = RegInfo.createVirtualRegister(&X86::GR32RegClass);
+
+  // Build and insert into an implicit UNDEF value. This is OK because
+  // we'll be shifting and then extracting the lower 16 bits.
+  // This has the potential to cause a partial register stall, e.g.
+ // movw (%rbp,%rcx,2), %dx
+ // leal -65(%rdx), %esi
+ // But testing has shown this *does* help performance in 64-bit mode (at
+ // least on modern x86 machines).
+ BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(X86::IMPLICIT_DEF), leaInReg);
+ MachineInstr *InsMI =
+ BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(X86::INSERT_SUBREG),leaInReg)
+ .addReg(leaInReg)
+ .addReg(Src, getKillRegState(isKill))
+ .addImm(X86::sub_16bit);
+
+ MachineInstrBuilder MIB = BuildMI(*MFI, MBBI, MI->getDebugLoc(),
+ get(Opc), leaOutReg);
+ switch (MIOpc) {
+ default:
+ llvm_unreachable(0);
+ break;
+ case X86::SHL16ri: {
+ unsigned ShAmt = MI->getOperand(2).getImm();
+ MIB.addReg(0).addImm(1 << ShAmt)
+ .addReg(leaInReg, RegState::Kill).addImm(0);
+ break;
+ }
+ case X86::INC16r:
+ case X86::INC64_16r:
+ addLeaRegOffset(MIB, leaInReg, true, 1);
+ break;
+ case X86::DEC16r:
+ case X86::DEC64_16r:
+ addLeaRegOffset(MIB, leaInReg, true, -1);
+ break;
+ case X86::ADD16ri:
+ case X86::ADD16ri8:
+ addLeaRegOffset(MIB, leaInReg, true, MI->getOperand(2).getImm());
+ break;
+ case X86::ADD16rr: {
+ unsigned Src2 = MI->getOperand(2).getReg();
+ bool isKill2 = MI->getOperand(2).isKill();
+ unsigned leaInReg2 = 0;
+ MachineInstr *InsMI2 = 0;
+ if (Src == Src2) {
+      // ADD16rr %reg1028<kill>, %reg1028: both sources are the same register,
+      // so only a single insert_subreg is needed.
+ addRegReg(MIB, leaInReg, true, leaInReg, false);
+ } else {
+ leaInReg2 = RegInfo.createVirtualRegister(&X86::GR32RegClass);
+      // Build and insert into an implicit UNDEF value. This is OK because
+      // we'll be shifting and then extracting the lower 16 bits.
+ BuildMI(*MFI, MIB, MI->getDebugLoc(), get(X86::IMPLICIT_DEF), leaInReg2);
+ InsMI2 =
+ BuildMI(*MFI, MIB, MI->getDebugLoc(), get(X86::INSERT_SUBREG),leaInReg2)
+ .addReg(leaInReg2)
+ .addReg(Src2, getKillRegState(isKill2))
+ .addImm(X86::sub_16bit);
+ addRegReg(MIB, leaInReg, true, leaInReg2, true);
+ }
+ if (LV && isKill2 && InsMI2)
+ LV->replaceKillInstruction(Src2, MI, InsMI2);
+ break;
+ }
+ }
+
+ MachineInstr *NewMI = MIB;
+ MachineInstr *ExtMI =
+ BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(X86::EXTRACT_SUBREG))
+ .addReg(Dest, RegState::Define | getDeadRegState(isDead))
+ .addReg(leaOutReg, RegState::Kill)
+ .addImm(X86::sub_16bit);
+
+ if (LV) {
+ // Update live variables
+ LV->getVarInfo(leaInReg).Kills.push_back(NewMI);
+ LV->getVarInfo(leaOutReg).Kills.push_back(ExtMI);
+ if (isKill)
+ LV->replaceKillInstruction(Src, MI, InsMI);
+ if (isDead)
+ LV->replaceKillInstruction(Dest, MI, ExtMI);
+ }
+
+ return ExtMI;
+}
+
+/// convertToThreeAddress - This method must be implemented by targets that
+/// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target
+/// may be able to convert a two-address instruction into a true
+/// three-address instruction on demand. This allows the X86 target (for
+/// example) to convert ADD and SHL instructions into LEA instructions if they
+/// would require register copies due to two-addressness.
+///
+/// This method returns a null pointer if the transformation cannot be
+/// performed, otherwise it returns the new instruction.
+///
+MachineInstr *
+X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
+ MachineBasicBlock::iterator &MBBI,
+ LiveVariables *LV) const {
+ MachineInstr *MI = MBBI;
+ MachineFunction &MF = *MI->getParent()->getParent();
+  // All input instructions are two-address instructions. Get the known operands.
+ unsigned Dest = MI->getOperand(0).getReg();
+ unsigned Src = MI->getOperand(1).getReg();
+ bool isDead = MI->getOperand(0).isDead();
+ bool isKill = MI->getOperand(1).isKill();
+
+ MachineInstr *NewMI = NULL;
+ // FIXME: 16-bit LEA's are really slow on Athlons, but not bad on P4's. When
+ // we have better subtarget support, enable the 16-bit LEA generation here.
+ // 16-bit LEA is also slow on Core2.
+ bool DisableLEA16 = true;
+ bool is64Bit = TM.getSubtarget<X86Subtarget>().is64Bit();
+
+ unsigned MIOpc = MI->getOpcode();
+ switch (MIOpc) {
+ case X86::SHUFPSrri: {
+ assert(MI->getNumOperands() == 4 && "Unknown shufps instruction!");
+ if (!TM.getSubtarget<X86Subtarget>().hasSSE2()) return 0;
+
+ unsigned B = MI->getOperand(1).getReg();
+ unsigned C = MI->getOperand(2).getReg();
+ if (B != C) return 0;
+ unsigned A = MI->getOperand(0).getReg();
+ unsigned M = MI->getOperand(3).getImm();
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::PSHUFDri))
+ .addReg(A, RegState::Define | getDeadRegState(isDead))
+ .addReg(B, getKillRegState(isKill)).addImm(M);
+ break;
+ }
+ case X86::SHL64ri: {
+ assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
+ // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses
+ // the flags produced by a shift yet, so this is safe.
+ unsigned ShAmt = MI->getOperand(2).getImm();
+ if (ShAmt == 0 || ShAmt >= 4) return 0;
+
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r))
+ .addReg(Dest, RegState::Define | getDeadRegState(isDead))
+ .addReg(0).addImm(1 << ShAmt)
+ .addReg(Src, getKillRegState(isKill))
+ .addImm(0);
+ break;
+ }
+ case X86::SHL32ri: {
+ assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
+ // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses
+ // the flags produced by a shift yet, so this is safe.
+ unsigned ShAmt = MI->getOperand(2).getImm();
+ if (ShAmt == 0 || ShAmt >= 4) return 0;
+
+ unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(Opc))
+ .addReg(Dest, RegState::Define | getDeadRegState(isDead))
+ .addReg(0).addImm(1 << ShAmt)
+ .addReg(Src, getKillRegState(isKill)).addImm(0);
+ break;
+ }
+ case X86::SHL16ri: {
+ assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
+ // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses
+ // the flags produced by a shift yet, so this is safe.
+ unsigned ShAmt = MI->getOperand(2).getImm();
+ if (ShAmt == 0 || ShAmt >= 4) return 0;
+
+ if (DisableLEA16)
+ return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
+ NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+ .addReg(Dest, RegState::Define | getDeadRegState(isDead))
+ .addReg(0).addImm(1 << ShAmt)
+ .addReg(Src, getKillRegState(isKill))
+ .addImm(0);
+ break;
+ }
+ default: {
+    // The following opcodes also set the condition code register(s). Only
+    // convert them to an equivalent LEA if the condition code register defs
+    // are dead!
+ if (hasLiveCondCodeDef(MI))
+ return 0;
+
+ switch (MIOpc) {
+ default: return 0;
+ case X86::INC64r:
+ case X86::INC32r:
+ case X86::INC64_32r: {
+ assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!");
+ unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r
+ : (is64Bit ? X86::LEA64_32r : X86::LEA32r);
+ NewMI = addLeaRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
+ .addReg(Dest, RegState::Define |
+ getDeadRegState(isDead)),
+ Src, isKill, 1);
+ break;
+ }
+ case X86::INC16r:
+ case X86::INC64_16r:
+ if (DisableLEA16)
+ return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
+ assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!");
+ NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+ .addReg(Dest, RegState::Define |
+ getDeadRegState(isDead)),
+ Src, isKill, 1);
+ break;
+ case X86::DEC64r:
+ case X86::DEC32r:
+ case X86::DEC64_32r: {
+ assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!");
+ unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r
+ : (is64Bit ? X86::LEA64_32r : X86::LEA32r);
+ NewMI = addLeaRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
+ .addReg(Dest, RegState::Define |
+ getDeadRegState(isDead)),
+ Src, isKill, -1);
+ break;
+ }
+ case X86::DEC16r:
+ case X86::DEC64_16r:
+ if (DisableLEA16)
+ return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
+ assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!");
+ NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+ .addReg(Dest, RegState::Define |
+ getDeadRegState(isDead)),
+ Src, isKill, -1);
+ break;
+ case X86::ADD64rr:
+ case X86::ADD32rr: {
+ assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
+ unsigned Opc = MIOpc == X86::ADD64rr ? X86::LEA64r
+ : (is64Bit ? X86::LEA64_32r : X86::LEA32r);
+ unsigned Src2 = MI->getOperand(2).getReg();
+ bool isKill2 = MI->getOperand(2).isKill();
+ NewMI = addRegReg(BuildMI(MF, MI->getDebugLoc(), get(Opc))
+ .addReg(Dest, RegState::Define |
+ getDeadRegState(isDead)),
+ Src, isKill, Src2, isKill2);
+ if (LV && isKill2)
+ LV->replaceKillInstruction(Src2, MI, NewMI);
+ break;
+ }
+ case X86::ADD16rr: {
+ if (DisableLEA16)
+ return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
+ assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
+ unsigned Src2 = MI->getOperand(2).getReg();
+ bool isKill2 = MI->getOperand(2).isKill();
+ NewMI = addRegReg(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+ .addReg(Dest, RegState::Define |
+ getDeadRegState(isDead)),
+ Src, isKill, Src2, isKill2);
+ if (LV && isKill2)
+ LV->replaceKillInstruction(Src2, MI, NewMI);
+ break;
+ }
+ case X86::ADD64ri32:
+ case X86::ADD64ri8:
+ assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
+ NewMI = addLeaRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r))
+ .addReg(Dest, RegState::Define |
+ getDeadRegState(isDead)),
+ Src, isKill, MI->getOperand(2).getImm());
+ break;
+ case X86::ADD32ri:
+ case X86::ADD32ri8: {
+ assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
+ unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
+ NewMI = addLeaRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
+ .addReg(Dest, RegState::Define |
+ getDeadRegState(isDead)),
+ Src, isKill, MI->getOperand(2).getImm());
+ break;
+ }
+ case X86::ADD16ri:
+ case X86::ADD16ri8:
+ if (DisableLEA16)
+ return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
+ assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
+ NewMI = addLeaRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+ .addReg(Dest, RegState::Define |
+ getDeadRegState(isDead)),
+ Src, isKill, MI->getOperand(2).getImm());
+ break;
+ }
+ }
+ }
+
+ if (!NewMI) return 0;
+
+ if (LV) { // Update live variables
+ if (isKill)
+ LV->replaceKillInstruction(Src, MI, NewMI);
+ if (isDead)
+ LV->replaceKillInstruction(Dest, MI, NewMI);
+ }
+
+ MFI->insert(MBBI, NewMI); // Insert the new inst
+ return NewMI;
+}
+
+/// commuteInstruction - We have a few instructions that must be hacked on to
+/// commute them.
+///
+MachineInstr *
+X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
+ switch (MI->getOpcode()) {
+ case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I)
+ case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I)
+ case X86::SHRD32rri8: // A = SHRD32rri8 B, C, I -> A = SHLD32rri8 C, B, (32-I)
+ case X86::SHLD32rri8: // A = SHLD32rri8 B, C, I -> A = SHRD32rri8 C, B, (32-I)
+ case X86::SHRD64rri8: // A = SHRD64rri8 B, C, I -> A = SHLD64rri8 C, B, (64-I)
+ case X86::SHLD64rri8:{// A = SHLD64rri8 B, C, I -> A = SHRD64rri8 C, B, (64-I)
+ unsigned Opc;
+ unsigned Size;
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::SHRD16rri8: Size = 16; Opc = X86::SHLD16rri8; break;
+ case X86::SHLD16rri8: Size = 16; Opc = X86::SHRD16rri8; break;
+ case X86::SHRD32rri8: Size = 32; Opc = X86::SHLD32rri8; break;
+ case X86::SHLD32rri8: Size = 32; Opc = X86::SHRD32rri8; break;
+ case X86::SHRD64rri8: Size = 64; Opc = X86::SHLD64rri8; break;
+ case X86::SHLD64rri8: Size = 64; Opc = X86::SHRD64rri8; break;
+ }
+ unsigned Amt = MI->getOperand(3).getImm();
+ if (NewMI) {
+ MachineFunction &MF = *MI->getParent()->getParent();
+ MI = MF.CloneMachineInstr(MI);
+ NewMI = false;
+ }
+ MI->setDesc(get(Opc));
+ MI->getOperand(3).setImm(Size-Amt);
+ return TargetInstrInfoImpl::commuteInstruction(MI, NewMI);
+ }
+ case X86::CMOVB16rr:
+ case X86::CMOVB32rr:
+ case X86::CMOVB64rr:
+ case X86::CMOVAE16rr:
+ case X86::CMOVAE32rr:
+ case X86::CMOVAE64rr:
+ case X86::CMOVE16rr:
+ case X86::CMOVE32rr:
+ case X86::CMOVE64rr:
+ case X86::CMOVNE16rr:
+ case X86::CMOVNE32rr:
+ case X86::CMOVNE64rr:
+ case X86::CMOVBE16rr:
+ case X86::CMOVBE32rr:
+ case X86::CMOVBE64rr:
+ case X86::CMOVA16rr:
+ case X86::CMOVA32rr:
+ case X86::CMOVA64rr:
+ case X86::CMOVL16rr:
+ case X86::CMOVL32rr:
+ case X86::CMOVL64rr:
+ case X86::CMOVGE16rr:
+ case X86::CMOVGE32rr:
+ case X86::CMOVGE64rr:
+ case X86::CMOVLE16rr:
+ case X86::CMOVLE32rr:
+ case X86::CMOVLE64rr:
+ case X86::CMOVG16rr:
+ case X86::CMOVG32rr:
+ case X86::CMOVG64rr:
+ case X86::CMOVS16rr:
+ case X86::CMOVS32rr:
+ case X86::CMOVS64rr:
+ case X86::CMOVNS16rr:
+ case X86::CMOVNS32rr:
+ case X86::CMOVNS64rr:
+ case X86::CMOVP16rr:
+ case X86::CMOVP32rr:
+ case X86::CMOVP64rr:
+ case X86::CMOVNP16rr:
+ case X86::CMOVNP32rr:
+ case X86::CMOVNP64rr:
+ case X86::CMOVO16rr:
+ case X86::CMOVO32rr:
+ case X86::CMOVO64rr:
+ case X86::CMOVNO16rr:
+ case X86::CMOVNO32rr:
+ case X86::CMOVNO64rr: {
+ unsigned Opc = 0;
+ switch (MI->getOpcode()) {
+ default: break;
+ case X86::CMOVB16rr: Opc = X86::CMOVAE16rr; break;
+ case X86::CMOVB32rr: Opc = X86::CMOVAE32rr; break;
+ case X86::CMOVB64rr: Opc = X86::CMOVAE64rr; break;
+ case X86::CMOVAE16rr: Opc = X86::CMOVB16rr; break;
+ case X86::CMOVAE32rr: Opc = X86::CMOVB32rr; break;
+ case X86::CMOVAE64rr: Opc = X86::CMOVB64rr; break;
+ case X86::CMOVE16rr: Opc = X86::CMOVNE16rr; break;
+ case X86::CMOVE32rr: Opc = X86::CMOVNE32rr; break;
+ case X86::CMOVE64rr: Opc = X86::CMOVNE64rr; break;
+ case X86::CMOVNE16rr: Opc = X86::CMOVE16rr; break;
+ case X86::CMOVNE32rr: Opc = X86::CMOVE32rr; break;
+ case X86::CMOVNE64rr: Opc = X86::CMOVE64rr; break;
+ case X86::CMOVBE16rr: Opc = X86::CMOVA16rr; break;
+ case X86::CMOVBE32rr: Opc = X86::CMOVA32rr; break;
+ case X86::CMOVBE64rr: Opc = X86::CMOVA64rr; break;
+ case X86::CMOVA16rr: Opc = X86::CMOVBE16rr; break;
+ case X86::CMOVA32rr: Opc = X86::CMOVBE32rr; break;
+ case X86::CMOVA64rr: Opc = X86::CMOVBE64rr; break;
+ case X86::CMOVL16rr: Opc = X86::CMOVGE16rr; break;
+ case X86::CMOVL32rr: Opc = X86::CMOVGE32rr; break;
+ case X86::CMOVL64rr: Opc = X86::CMOVGE64rr; break;
+ case X86::CMOVGE16rr: Opc = X86::CMOVL16rr; break;
+ case X86::CMOVGE32rr: Opc = X86::CMOVL32rr; break;
+ case X86::CMOVGE64rr: Opc = X86::CMOVL64rr; break;
+ case X86::CMOVLE16rr: Opc = X86::CMOVG16rr; break;
+ case X86::CMOVLE32rr: Opc = X86::CMOVG32rr; break;
+ case X86::CMOVLE64rr: Opc = X86::CMOVG64rr; break;
+ case X86::CMOVG16rr: Opc = X86::CMOVLE16rr; break;
+ case X86::CMOVG32rr: Opc = X86::CMOVLE32rr; break;
+ case X86::CMOVG64rr: Opc = X86::CMOVLE64rr; break;
+ case X86::CMOVS16rr: Opc = X86::CMOVNS16rr; break;
+ case X86::CMOVS32rr: Opc = X86::CMOVNS32rr; break;
+ case X86::CMOVS64rr: Opc = X86::CMOVNS64rr; break;
+ case X86::CMOVNS16rr: Opc = X86::CMOVS16rr; break;
+ case X86::CMOVNS32rr: Opc = X86::CMOVS32rr; break;
+ case X86::CMOVNS64rr: Opc = X86::CMOVS64rr; break;
+ case X86::CMOVP16rr: Opc = X86::CMOVNP16rr; break;
+ case X86::CMOVP32rr: Opc = X86::CMOVNP32rr; break;
+ case X86::CMOVP64rr: Opc = X86::CMOVNP64rr; break;
+ case X86::CMOVNP16rr: Opc = X86::CMOVP16rr; break;
+ case X86::CMOVNP32rr: Opc = X86::CMOVP32rr; break;
+ case X86::CMOVNP64rr: Opc = X86::CMOVP64rr; break;
+ case X86::CMOVO16rr: Opc = X86::CMOVNO16rr; break;
+ case X86::CMOVO32rr: Opc = X86::CMOVNO32rr; break;
+ case X86::CMOVO64rr: Opc = X86::CMOVNO64rr; break;
+ case X86::CMOVNO16rr: Opc = X86::CMOVO16rr; break;
+ case X86::CMOVNO32rr: Opc = X86::CMOVO32rr; break;
+ case X86::CMOVNO64rr: Opc = X86::CMOVO64rr; break;
+ }
+ if (NewMI) {
+ MachineFunction &MF = *MI->getParent()->getParent();
+ MI = MF.CloneMachineInstr(MI);
+ NewMI = false;
+ }
+ MI->setDesc(get(Opc));
+ // Fallthrough intended.
+ }
+ default:
+ return TargetInstrInfoImpl::commuteInstruction(MI, NewMI);
+ }
+}
+
+static X86::CondCode GetCondFromBranchOpc(unsigned BrOpc) {
+ switch (BrOpc) {
+ default: return X86::COND_INVALID;
+ case X86::JE_4: return X86::COND_E;
+ case X86::JNE_4: return X86::COND_NE;
+ case X86::JL_4: return X86::COND_L;
+ case X86::JLE_4: return X86::COND_LE;
+ case X86::JG_4: return X86::COND_G;
+ case X86::JGE_4: return X86::COND_GE;
+ case X86::JB_4: return X86::COND_B;
+ case X86::JBE_4: return X86::COND_BE;
+ case X86::JA_4: return X86::COND_A;
+ case X86::JAE_4: return X86::COND_AE;
+ case X86::JS_4: return X86::COND_S;
+ case X86::JNS_4: return X86::COND_NS;
+ case X86::JP_4: return X86::COND_P;
+ case X86::JNP_4: return X86::COND_NP;
+ case X86::JO_4: return X86::COND_O;
+ case X86::JNO_4: return X86::COND_NO;
+ }
+}
+
+unsigned X86::GetCondBranchFromCond(X86::CondCode CC) {
+ switch (CC) {
+ default: llvm_unreachable("Illegal condition code!");
+ case X86::COND_E: return X86::JE_4;
+ case X86::COND_NE: return X86::JNE_4;
+ case X86::COND_L: return X86::JL_4;
+ case X86::COND_LE: return X86::JLE_4;
+ case X86::COND_G: return X86::JG_4;
+ case X86::COND_GE: return X86::JGE_4;
+ case X86::COND_B: return X86::JB_4;
+ case X86::COND_BE: return X86::JBE_4;
+ case X86::COND_A: return X86::JA_4;
+ case X86::COND_AE: return X86::JAE_4;
+ case X86::COND_S: return X86::JS_4;
+ case X86::COND_NS: return X86::JNS_4;
+ case X86::COND_P: return X86::JP_4;
+ case X86::COND_NP: return X86::JNP_4;
+ case X86::COND_O: return X86::JO_4;
+ case X86::COND_NO: return X86::JNO_4;
+ }
+}
+
+/// GetOppositeBranchCondition - Return the inverse of the specified condition,
+/// e.g. turning COND_E to COND_NE.
+X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) {
+ switch (CC) {
+ default: llvm_unreachable("Illegal condition code!");
+ case X86::COND_E: return X86::COND_NE;
+ case X86::COND_NE: return X86::COND_E;
+ case X86::COND_L: return X86::COND_GE;
+ case X86::COND_LE: return X86::COND_G;
+ case X86::COND_G: return X86::COND_LE;
+ case X86::COND_GE: return X86::COND_L;
+ case X86::COND_B: return X86::COND_AE;
+ case X86::COND_BE: return X86::COND_A;
+ case X86::COND_A: return X86::COND_BE;
+ case X86::COND_AE: return X86::COND_B;
+ case X86::COND_S: return X86::COND_NS;
+ case X86::COND_NS: return X86::COND_S;
+ case X86::COND_P: return X86::COND_NP;
+ case X86::COND_NP: return X86::COND_P;
+ case X86::COND_O: return X86::COND_NO;
+ case X86::COND_NO: return X86::COND_O;
+ }
+}
+
+bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const {
+ const TargetInstrDesc &TID = MI->getDesc();
+ if (!TID.isTerminator()) return false;
+
+ // Conditional branch is a special case.
+ if (TID.isBranch() && !TID.isBarrier())
+ return true;
+ if (!TID.isPredicable())
+ return true;
+ return !isPredicated(MI);
+}
+
+// For purposes of branch analysis do not count FP_REG_KILL as a terminator.
+static bool isBrAnalysisUnpredicatedTerminator(const MachineInstr *MI,
+ const X86InstrInfo &TII) {
+ if (MI->getOpcode() == X86::FP_REG_KILL)
+ return false;
+ return TII.isUnpredicatedTerminator(MI);
+}
+
+bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
+ // Start from the bottom of the block and work up, examining the
+ // terminator instructions.
+ MachineBasicBlock::iterator I = MBB.end();
+ MachineBasicBlock::iterator UnCondBrIter = MBB.end();
+ while (I != MBB.begin()) {
+ --I;
+ if (I->isDebugValue())
+ continue;
+
+ // Working from the bottom, when we see a non-terminator instruction, we're
+ // done.
+ if (!isBrAnalysisUnpredicatedTerminator(I, *this))
+ break;
+
+ // A terminator that isn't a branch can't easily be handled by this
+ // analysis.
+ if (!I->getDesc().isBranch())
+ return true;
+
+ // Handle unconditional branches.
+ if (I->getOpcode() == X86::JMP_4) {
+ UnCondBrIter = I;
+
+ if (!AllowModify) {
+ TBB = I->getOperand(0).getMBB();
+ continue;
+ }
+
+ // If the block has any instructions after a JMP, delete them.
+ while (llvm::next(I) != MBB.end())
+ llvm::next(I)->eraseFromParent();
+
+ Cond.clear();
+ FBB = 0;
+
+ // Delete the JMP if it's equivalent to a fall-through.
+ if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
+ TBB = 0;
+ I->eraseFromParent();
+ I = MBB.end();
+ UnCondBrIter = MBB.end();
+ continue;
+ }
+
+ // TBB is used to indicate the unconditional destination.
+ TBB = I->getOperand(0).getMBB();
+ continue;
+ }
+
+ // Handle conditional branches.
+ X86::CondCode BranchCode = GetCondFromBranchOpc(I->getOpcode());
+ if (BranchCode == X86::COND_INVALID)
+ return true; // Can't handle indirect branch.
+
+ // Working from the bottom, handle the first conditional branch.
+ if (Cond.empty()) {
+ MachineBasicBlock *TargetBB = I->getOperand(0).getMBB();
+ if (AllowModify && UnCondBrIter != MBB.end() &&
+ MBB.isLayoutSuccessor(TargetBB)) {
+ // If we can modify the code and it ends in something like:
+ //
+ // jCC L1
+ // jmp L2
+ // L1:
+ // ...
+ // L2:
+ //
+ // Then we can change this to:
+ //
+ // jnCC L2
+ // L1:
+ // ...
+ // L2:
+ //
+ // Which is a bit more efficient.
+ // We conditionally jump to the fall-through block.
+ BranchCode = GetOppositeBranchCondition(BranchCode);
+ unsigned JNCC = GetCondBranchFromCond(BranchCode);
+ MachineBasicBlock::iterator OldInst = I;
+
+ BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(JNCC))
+ .addMBB(UnCondBrIter->getOperand(0).getMBB());
+ BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JMP_4))
+ .addMBB(TargetBB);
+ MBB.addSuccessor(TargetBB);
+
+ OldInst->eraseFromParent();
+ UnCondBrIter->eraseFromParent();
+
+ // Restart the analysis.
+ UnCondBrIter = MBB.end();
+ I = MBB.end();
+ continue;
+ }
+
+ FBB = TBB;
+ TBB = I->getOperand(0).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(BranchCode));
+ continue;
+ }
+
+ // Handle subsequent conditional branches. Only handle the case where all
+ // conditional branches branch to the same destination and their condition
+ // opcodes fit one of the special multi-branch idioms.
+ assert(Cond.size() == 1);
+ assert(TBB);
+
+ // Only handle the case where all conditional branches branch to the same
+ // destination.
+ if (TBB != I->getOperand(0).getMBB())
+ return true;
+
+ // If the conditions are the same, we can leave them alone.
+ X86::CondCode OldBranchCode = (X86::CondCode)Cond[0].getImm();
+ if (OldBranchCode == BranchCode)
+ continue;
+
+ // If they differ, see if they fit one of the known patterns. Theoretically,
+ // we could handle more patterns here, but we shouldn't expect to see them
+ // if instruction selection has done a reasonable job.
+ if ((OldBranchCode == X86::COND_NP &&
+ BranchCode == X86::COND_E) ||
+ (OldBranchCode == X86::COND_E &&
+ BranchCode == X86::COND_NP))
+ BranchCode = X86::COND_NP_OR_E;
+ else if ((OldBranchCode == X86::COND_P &&
+ BranchCode == X86::COND_NE) ||
+ (OldBranchCode == X86::COND_NE &&
+ BranchCode == X86::COND_P))
+ BranchCode = X86::COND_NE_OR_P;
+ else
+ return true;
+
+ // Update the MachineOperand.
+ Cond[0].setImm(BranchCode);
+ }
+
+ return false;
+}
+
+unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+ MachineBasicBlock::iterator I = MBB.end();
+ unsigned Count = 0;
+
+ while (I != MBB.begin()) {
+ --I;
+ if (I->isDebugValue())
+ continue;
+ if (I->getOpcode() != X86::JMP_4 &&
+ GetCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID)
+ break;
+ // Remove the branch.
+ I->eraseFromParent();
+ I = MBB.end();
+ ++Count;
+ }
+
+ return Count;
+}
+
+unsigned
+X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ const SmallVectorImpl<MachineOperand> &Cond) const {
+ // FIXME this should probably have a DebugLoc operand
+ DebugLoc dl;
+ // Shouldn't be a fall through.
+ assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+ assert((Cond.size() == 1 || Cond.size() == 0) &&
+ "X86 branch conditions have one component!");
+
+ if (Cond.empty()) {
+ // Unconditional branch?
+ assert(!FBB && "Unconditional branch with multiple successors!");
+ BuildMI(&MBB, dl, get(X86::JMP_4)).addMBB(TBB);
+ return 1;
+ }
+
+ // Conditional branch.
+ unsigned Count = 0;
+ X86::CondCode CC = (X86::CondCode)Cond[0].getImm();
+ switch (CC) {
+ case X86::COND_NP_OR_E:
+ // Synthesize NP_OR_E with two branches.
+ BuildMI(&MBB, dl, get(X86::JNP_4)).addMBB(TBB);
+ ++Count;
+ BuildMI(&MBB, dl, get(X86::JE_4)).addMBB(TBB);
+ ++Count;
+ break;
+ case X86::COND_NE_OR_P:
+ // Synthesize NE_OR_P with two branches.
+ BuildMI(&MBB, dl, get(X86::JNE_4)).addMBB(TBB);
+ ++Count;
+ BuildMI(&MBB, dl, get(X86::JP_4)).addMBB(TBB);
+ ++Count;
+ break;
+ default: {
+ unsigned Opc = GetCondBranchFromCond(CC);
+ BuildMI(&MBB, dl, get(Opc)).addMBB(TBB);
+ ++Count;
+ }
+ }
+ if (FBB) {
+ // Two-way Conditional branch. Insert the second branch.
+ BuildMI(&MBB, dl, get(X86::JMP_4)).addMBB(FBB);
+ ++Count;
+ }
+ return Count;
+}
+
+/// isHReg - Test if the given register is a physical h register.
+static bool isHReg(unsigned Reg) {
+ return X86::GR8_ABCD_HRegClass.contains(Reg);
+}
+
+bool X86InstrInfo::copyRegToReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, unsigned SrcReg,
+ const TargetRegisterClass *DestRC,
+ const TargetRegisterClass *SrcRC,
+ DebugLoc DL) const {
+
+  // Determine if DestRC and SrcRC have a common superclass.
+ const TargetRegisterClass *CommonRC = DestRC;
+ if (DestRC == SrcRC)
+ /* Source and destination have the same register class. */;
+ else if (CommonRC->hasSuperClass(SrcRC))
+ CommonRC = SrcRC;
+ else if (!DestRC->hasSubClass(SrcRC)) {
+    // Neither GR64_NOREX nor GR64_NOSP is a superclass of the other,
+    // but we want to copy them as GR64. Similarly, for GR32_NOREX and
+    // GR32_NOSP, copy as GR32.
+ if (SrcRC->hasSuperClass(&X86::GR64RegClass) &&
+ DestRC->hasSuperClass(&X86::GR64RegClass))
+ CommonRC = &X86::GR64RegClass;
+ else if (SrcRC->hasSuperClass(&X86::GR32RegClass) &&
+ DestRC->hasSuperClass(&X86::GR32RegClass))
+ CommonRC = &X86::GR32RegClass;
+ else
+ CommonRC = 0;
+ }
+
+ if (CommonRC) {
+ unsigned Opc;
+ if (CommonRC == &X86::GR64RegClass || CommonRC == &X86::GR64_NOSPRegClass) {
+ Opc = X86::MOV64rr;
+ } else if (CommonRC == &X86::GR32RegClass ||
+ CommonRC == &X86::GR32_NOSPRegClass) {
+ Opc = X86::MOV32rr;
+ } else if (CommonRC == &X86::GR16RegClass) {
+ Opc = X86::MOV16rr;
+ } else if (CommonRC == &X86::GR8RegClass) {
+ // Copying to or from a physical H register on x86-64 requires a NOREX
+ // move. Otherwise use a normal move.
+ if ((isHReg(DestReg) || isHReg(SrcReg)) &&
+ TM.getSubtarget<X86Subtarget>().is64Bit())
+ Opc = X86::MOV8rr_NOREX;
+ else
+ Opc = X86::MOV8rr;
+ } else if (CommonRC == &X86::GR64_ABCDRegClass) {
+ Opc = X86::MOV64rr;
+ } else if (CommonRC == &X86::GR32_ABCDRegClass) {
+ Opc = X86::MOV32rr;
+ } else if (CommonRC == &X86::GR16_ABCDRegClass) {
+ Opc = X86::MOV16rr;
+ } else if (CommonRC == &X86::GR8_ABCD_LRegClass) {
+ Opc = X86::MOV8rr;
+ } else if (CommonRC == &X86::GR8_ABCD_HRegClass) {
+ if (TM.getSubtarget<X86Subtarget>().is64Bit())
+ Opc = X86::MOV8rr_NOREX;
+ else
+ Opc = X86::MOV8rr;
+ } else if (CommonRC == &X86::GR64_NOREXRegClass ||
+ CommonRC == &X86::GR64_NOREX_NOSPRegClass) {
+ Opc = X86::MOV64rr;
+ } else if (CommonRC == &X86::GR32_NOREXRegClass) {
+ Opc = X86::MOV32rr;
+ } else if (CommonRC == &X86::GR16_NOREXRegClass) {
+ Opc = X86::MOV16rr;
+ } else if (CommonRC == &X86::GR8_NOREXRegClass) {
+ Opc = X86::MOV8rr;
+ } else if (CommonRC == &X86::GR64_TCRegClass) {
+ Opc = X86::MOV64rr_TC;
+ } else if (CommonRC == &X86::GR32_TCRegClass) {
+ Opc = X86::MOV32rr_TC;
+ } else if (CommonRC == &X86::RFP32RegClass) {
+ Opc = X86::MOV_Fp3232;
+ } else if (CommonRC == &X86::RFP64RegClass || CommonRC == &X86::RSTRegClass) {
+ Opc = X86::MOV_Fp6464;
+ } else if (CommonRC == &X86::RFP80RegClass) {
+ Opc = X86::MOV_Fp8080;
+ } else if (CommonRC == &X86::FR32RegClass) {
+ Opc = X86::FsMOVAPSrr;
+ } else if (CommonRC == &X86::FR64RegClass) {
+ Opc = X86::FsMOVAPDrr;
+ } else if (CommonRC == &X86::VR128RegClass) {
+ Opc = X86::MOVAPSrr;
+ } else if (CommonRC == &X86::VR64RegClass) {
+ Opc = X86::MMX_MOVQ64rr;
+ } else {
+ return false;
+ }
+ BuildMI(MBB, MI, DL, get(Opc), DestReg).addReg(SrcReg);
+ return true;
+ }
+
+ // Moving EFLAGS to / from another register requires a push and a pop.
+ if (SrcRC == &X86::CCRRegClass) {
+ if (SrcReg != X86::EFLAGS)
+ return false;
+ if (DestRC == &X86::GR64RegClass || DestRC == &X86::GR64_NOSPRegClass) {
+ BuildMI(MBB, MI, DL, get(X86::PUSHF64));
+ BuildMI(MBB, MI, DL, get(X86::POP64r), DestReg);
+ return true;
+ } else if (DestRC == &X86::GR32RegClass ||
+ DestRC == &X86::GR32_NOSPRegClass) {
+ BuildMI(MBB, MI, DL, get(X86::PUSHF32));
+ BuildMI(MBB, MI, DL, get(X86::POP32r), DestReg);
+ return true;
+ }
+ } else if (DestRC == &X86::CCRRegClass) {
+ if (DestReg != X86::EFLAGS)
+ return false;
+    if (SrcRC == &X86::GR64RegClass || SrcRC == &X86::GR64_NOSPRegClass) {
+ BuildMI(MBB, MI, DL, get(X86::PUSH64r)).addReg(SrcReg);
+ BuildMI(MBB, MI, DL, get(X86::POPF64));
+ return true;
+    } else if (SrcRC == &X86::GR32RegClass ||
+               SrcRC == &X86::GR32_NOSPRegClass) {
+ BuildMI(MBB, MI, DL, get(X86::PUSH32r)).addReg(SrcReg);
+ BuildMI(MBB, MI, DL, get(X86::POPF32));
+ return true;
+ }
+ }
+
+ // Moving from ST(0) turns into FpGET_ST0_32 etc.
+ if (SrcRC == &X86::RSTRegClass) {
+ // Copying from ST(0)/ST(1).
+ if (SrcReg != X86::ST0 && SrcReg != X86::ST1)
+ // Can only copy from ST(0)/ST(1) right now
+ return false;
+ bool isST0 = SrcReg == X86::ST0;
+ unsigned Opc;
+ if (DestRC == &X86::RFP32RegClass)
+ Opc = isST0 ? X86::FpGET_ST0_32 : X86::FpGET_ST1_32;
+ else if (DestRC == &X86::RFP64RegClass)
+ Opc = isST0 ? X86::FpGET_ST0_64 : X86::FpGET_ST1_64;
+ else {
+ if (DestRC != &X86::RFP80RegClass)
+ return false;
+ Opc = isST0 ? X86::FpGET_ST0_80 : X86::FpGET_ST1_80;
+ }
+ BuildMI(MBB, MI, DL, get(Opc), DestReg);
+ return true;
+ }
+
+ // Moving to ST(0) turns into FpSET_ST0_32 etc.
+ if (DestRC == &X86::RSTRegClass) {
+ // Copying to ST(0) / ST(1).
+ if (DestReg != X86::ST0 && DestReg != X86::ST1)
+ // Can only copy to TOS right now
+ return false;
+ bool isST0 = DestReg == X86::ST0;
+ unsigned Opc;
+ if (SrcRC == &X86::RFP32RegClass)
+ Opc = isST0 ? X86::FpSET_ST0_32 : X86::FpSET_ST1_32;
+ else if (SrcRC == &X86::RFP64RegClass)
+ Opc = isST0 ? X86::FpSET_ST0_64 : X86::FpSET_ST1_64;
+ else {
+ if (SrcRC != &X86::RFP80RegClass)
+ return false;
+ Opc = isST0 ? X86::FpSET_ST0_80 : X86::FpSET_ST1_80;
+ }
+ BuildMI(MBB, MI, DL, get(Opc)).addReg(SrcReg);
+ return true;
+ }
+
+ // Not yet supported!
+ return false;
+}
+
+static unsigned getStoreRegOpcode(unsigned SrcReg,
+ const TargetRegisterClass *RC,
+ bool isStackAligned,
+ TargetMachine &TM) {
+ unsigned Opc = 0;
+ if (RC == &X86::GR64RegClass || RC == &X86::GR64_NOSPRegClass) {
+ Opc = X86::MOV64mr;
+ } else if (RC == &X86::GR32RegClass || RC == &X86::GR32_NOSPRegClass) {
+ Opc = X86::MOV32mr;
+ } else if (RC == &X86::GR16RegClass) {
+ Opc = X86::MOV16mr;
+ } else if (RC == &X86::GR8RegClass) {
+ // Copying to or from a physical H register on x86-64 requires a NOREX
+ // move. Otherwise use a normal move.
+ if (isHReg(SrcReg) &&
+ TM.getSubtarget<X86Subtarget>().is64Bit())
+ Opc = X86::MOV8mr_NOREX;
+ else
+ Opc = X86::MOV8mr;
+ } else if (RC == &X86::GR64_ABCDRegClass) {
+ Opc = X86::MOV64mr;
+ } else if (RC == &X86::GR32_ABCDRegClass) {
+ Opc = X86::MOV32mr;
+ } else if (RC == &X86::GR16_ABCDRegClass) {
+ Opc = X86::MOV16mr;
+ } else if (RC == &X86::GR8_ABCD_LRegClass) {
+ Opc = X86::MOV8mr;
+ } else if (RC == &X86::GR8_ABCD_HRegClass) {
+ if (TM.getSubtarget<X86Subtarget>().is64Bit())
+ Opc = X86::MOV8mr_NOREX;
+ else
+ Opc = X86::MOV8mr;
+ } else if (RC == &X86::GR64_NOREXRegClass ||
+ RC == &X86::GR64_NOREX_NOSPRegClass) {
+ Opc = X86::MOV64mr;
+ } else if (RC == &X86::GR32_NOREXRegClass) {
+ Opc = X86::MOV32mr;
+ } else if (RC == &X86::GR16_NOREXRegClass) {
+ Opc = X86::MOV16mr;
+ } else if (RC == &X86::GR8_NOREXRegClass) {
+ Opc = X86::MOV8mr;
+ } else if (RC == &X86::GR64_TCRegClass) {
+ Opc = X86::MOV64mr_TC;
+ } else if (RC == &X86::GR32_TCRegClass) {
+ Opc = X86::MOV32mr_TC;
+ } else if (RC == &X86::RFP80RegClass) {
+ Opc = X86::ST_FpP80m; // pops
+ } else if (RC == &X86::RFP64RegClass) {
+ Opc = X86::ST_Fp64m;
+ } else if (RC == &X86::RFP32RegClass) {
+ Opc = X86::ST_Fp32m;
+ } else if (RC == &X86::FR32RegClass) {
+ Opc = X86::MOVSSmr;
+ } else if (RC == &X86::FR64RegClass) {
+ Opc = X86::MOVSDmr;
+ } else if (RC == &X86::VR128RegClass) {
+ // If stack is realigned we can use aligned stores.
+ Opc = isStackAligned ? X86::MOVAPSmr : X86::MOVUPSmr;
+ } else if (RC == &X86::VR64RegClass) {
+ Opc = X86::MMX_MOVQ64mr;
+ } else {
+ llvm_unreachable("Unknown regclass");
+ }
+
+ return Opc;
+}
+
+void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned SrcReg, bool isKill, int FrameIdx,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const {
+ const MachineFunction &MF = *MBB.getParent();
+ bool isAligned = (RI.getStackAlignment() >= 16) || RI.canRealignStack(MF);
+ unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, TM);
+ DebugLoc DL = MBB.findDebugLoc(MI);
+ addFrameReference(BuildMI(MBB, MI, DL, get(Opc)), FrameIdx)
+ .addReg(SrcReg, getKillRegState(isKill));
+}
+
+void X86InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
+ bool isKill,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ MachineInstr::mmo_iterator MMOBegin,
+ MachineInstr::mmo_iterator MMOEnd,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const {
+ bool isAligned = (*MMOBegin)->getAlignment() >= 16;
+ unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, TM);
+ DebugLoc DL;
+ MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc));
+ for (unsigned i = 0, e = Addr.size(); i != e; ++i)
+ MIB.addOperand(Addr[i]);
+ MIB.addReg(SrcReg, getKillRegState(isKill));
+ (*MIB).setMemRefs(MMOBegin, MMOEnd);
+ NewMIs.push_back(MIB);
+}
+
+static unsigned getLoadRegOpcode(unsigned DestReg,
+ const TargetRegisterClass *RC,
+ bool isStackAligned,
+ const TargetMachine &TM) {
+ unsigned Opc = 0;
+ if (RC == &X86::GR64RegClass || RC == &X86::GR64_NOSPRegClass) {
+ Opc = X86::MOV64rm;
+ } else if (RC == &X86::GR32RegClass || RC == &X86::GR32_NOSPRegClass) {
+ Opc = X86::MOV32rm;
+ } else if (RC == &X86::GR16RegClass) {
+ Opc = X86::MOV16rm;
+ } else if (RC == &X86::GR8RegClass) {
+ // Copying to or from a physical H register on x86-64 requires a NOREX
+ // move. Otherwise use a normal move.
+ if (isHReg(DestReg) &&
+ TM.getSubtarget<X86Subtarget>().is64Bit())
+ Opc = X86::MOV8rm_NOREX;
+ else
+ Opc = X86::MOV8rm;
+ } else if (RC == &X86::GR64_ABCDRegClass) {
+ Opc = X86::MOV64rm;
+ } else if (RC == &X86::GR32_ABCDRegClass) {
+ Opc = X86::MOV32rm;
+ } else if (RC == &X86::GR16_ABCDRegClass) {
+ Opc = X86::MOV16rm;
+ } else if (RC == &X86::GR8_ABCD_LRegClass) {
+ Opc = X86::MOV8rm;
+ } else if (RC == &X86::GR8_ABCD_HRegClass) {
+ if (TM.getSubtarget<X86Subtarget>().is64Bit())
+ Opc = X86::MOV8rm_NOREX;
+ else
+ Opc = X86::MOV8rm;
+ } else if (RC == &X86::GR64_NOREXRegClass ||
+ RC == &X86::GR64_NOREX_NOSPRegClass) {
+ Opc = X86::MOV64rm;
+ } else if (RC == &X86::GR32_NOREXRegClass) {
+ Opc = X86::MOV32rm;
+ } else if (RC == &X86::GR16_NOREXRegClass) {
+ Opc = X86::MOV16rm;
+ } else if (RC == &X86::GR8_NOREXRegClass) {
+ Opc = X86::MOV8rm;
+ } else if (RC == &X86::GR64_TCRegClass) {
+ Opc = X86::MOV64rm_TC;
+ } else if (RC == &X86::GR32_TCRegClass) {
+ Opc = X86::MOV32rm_TC;
+ } else if (RC == &X86::RFP80RegClass) {
+ Opc = X86::LD_Fp80m;
+ } else if (RC == &X86::RFP64RegClass) {
+ Opc = X86::LD_Fp64m;
+ } else if (RC == &X86::RFP32RegClass) {
+ Opc = X86::LD_Fp32m;
+ } else if (RC == &X86::FR32RegClass) {
+ Opc = X86::MOVSSrm;
+ } else if (RC == &X86::FR64RegClass) {
+ Opc = X86::MOVSDrm;
+ } else if (RC == &X86::VR128RegClass) {
+ // If stack is realigned we can use aligned loads.
+ Opc = isStackAligned ? X86::MOVAPSrm : X86::MOVUPSrm;
+ } else if (RC == &X86::VR64RegClass) {
+ Opc = X86::MMX_MOVQ64rm;
+ } else {
+ llvm_unreachable("Unknown regclass");
+ }
+
+ return Opc;
+}
+
+void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, int FrameIdx,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const {
+ const MachineFunction &MF = *MBB.getParent();
+ bool isAligned = (RI.getStackAlignment() >= 16) || RI.canRealignStack(MF);
+ unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, TM);
+ DebugLoc DL = MBB.findDebugLoc(MI);
+ addFrameReference(BuildMI(MBB, MI, DL, get(Opc), DestReg), FrameIdx);
+}
+
+void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ MachineInstr::mmo_iterator MMOBegin,
+ MachineInstr::mmo_iterator MMOEnd,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const {
+ bool isAligned = (*MMOBegin)->getAlignment() >= 16;
+ unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, TM);
+ DebugLoc DL;
+ MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg);
+ for (unsigned i = 0, e = Addr.size(); i != e; ++i)
+ MIB.addOperand(Addr[i]);
+ (*MIB).setMemRefs(MMOBegin, MMOEnd);
+ NewMIs.push_back(MIB);
+}
+
+bool X86InstrInfo::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const {
+ if (CSI.empty())
+ return false;
+
+ DebugLoc DL = MBB.findDebugLoc(MI);
+
+ bool is64Bit = TM.getSubtarget<X86Subtarget>().is64Bit();
+ bool isWin64 = TM.getSubtarget<X86Subtarget>().isTargetWin64();
+ unsigned SlotSize = is64Bit ? 8 : 4;
+
+ MachineFunction &MF = *MBB.getParent();
+ unsigned FPReg = RI.getFrameRegister(MF);
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ unsigned CalleeFrameSize = 0;
+
+ unsigned Opc = is64Bit ? X86::PUSH64r : X86::PUSH32r;
+ for (unsigned i = CSI.size(); i != 0; --i) {
+ unsigned Reg = CSI[i-1].getReg();
+ const TargetRegisterClass *RegClass = CSI[i-1].getRegClass();
+ // Add the callee-saved register as live-in. It's killed at the spill.
+ MBB.addLiveIn(Reg);
+ if (Reg == FPReg)
+ // X86RegisterInfo::emitPrologue will handle spilling of frame register.
+ continue;
+ if (RegClass != &X86::VR128RegClass && !isWin64) {
+ CalleeFrameSize += SlotSize;
+ BuildMI(MBB, MI, DL, get(Opc)).addReg(Reg, RegState::Kill);
+ } else {
+ storeRegToStackSlot(MBB, MI, Reg, true, CSI[i-1].getFrameIdx(), RegClass,
+ &RI);
+ }
+ }
+
+ X86FI->setCalleeSavedFrameSize(CalleeFrameSize);
+ return true;
+}
+
+bool X86InstrInfo::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const {
+ if (CSI.empty())
+ return false;
+
+ DebugLoc DL = MBB.findDebugLoc(MI);
+
+ MachineFunction &MF = *MBB.getParent();
+ unsigned FPReg = RI.getFrameRegister(MF);
+ bool is64Bit = TM.getSubtarget<X86Subtarget>().is64Bit();
+ bool isWin64 = TM.getSubtarget<X86Subtarget>().isTargetWin64();
+ unsigned Opc = is64Bit ? X86::POP64r : X86::POP32r;
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+ unsigned Reg = CSI[i].getReg();
+ if (Reg == FPReg)
+ // X86RegisterInfo::emitEpilogue will handle restoring of frame register.
+ continue;
+ const TargetRegisterClass *RegClass = CSI[i].getRegClass();
+ if (RegClass != &X86::VR128RegClass && !isWin64) {
+ BuildMI(MBB, MI, DL, get(Opc), Reg);
+ } else {
+ loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), RegClass, &RI);
+ }
+ }
+ return true;
+}
+
+MachineInstr*
+X86InstrInfo::emitFrameIndexDebugValue(MachineFunction &MF,
+ int FrameIx, uint64_t Offset,
+ const MDNode *MDPtr,
+ DebugLoc DL) const {
+ X86AddressMode AM;
+ AM.BaseType = X86AddressMode::FrameIndexBase;
+ AM.Base.FrameIndex = FrameIx;
+ MachineInstrBuilder MIB = BuildMI(MF, DL, get(X86::DBG_VALUE));
+ addFullAddress(MIB, AM).addImm(Offset).addMetadata(MDPtr);
+ return &*MIB;
+}
+
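+/// FuseTwoAddrInst - Create a new instruction that folds the address in MOs
+/// into the tied def/use operand pair of MI: the address operands take the
+/// place of operands 0 and 1, and the remaining operands are copied over.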
+static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
+ const SmallVectorImpl<MachineOperand> &MOs,
+ MachineInstr *MI,
+ const TargetInstrInfo &TII) {
+ // Create the base instruction with the memory operand as the first part.
+ MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode),
+ MI->getDebugLoc(), true);
+ MachineInstrBuilder MIB(NewMI);
+ unsigned NumAddrOps = MOs.size();
+ for (unsigned i = 0; i != NumAddrOps; ++i)
+ MIB.addOperand(MOs[i]);
+ if (NumAddrOps < 4) // FrameIndex only
+ addOffset(MIB, 0);
+
+ // Loop over the rest of the ri operands, converting them over.
+ unsigned NumOps = MI->getDesc().getNumOperands()-2;
+ for (unsigned i = 0; i != NumOps; ++i) {
+ MachineOperand &MO = MI->getOperand(i+2);
+ MIB.addOperand(MO);
+ }
+ for (unsigned i = NumOps+2, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ MIB.addOperand(MO);
+ }
+ return MIB;
+}
+
+static MachineInstr *FuseInst(MachineFunction &MF,
+ unsigned Opcode, unsigned OpNo,
+ const SmallVectorImpl<MachineOperand> &MOs,
+ MachineInstr *MI, const TargetInstrInfo &TII) {
+ MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode),
+ MI->getDebugLoc(), true);
+ MachineInstrBuilder MIB(NewMI);
+
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (i == OpNo) {
+ assert(MO.isReg() && "Expected to fold into reg operand!");
+ unsigned NumAddrOps = MOs.size();
+ for (unsigned i = 0; i != NumAddrOps; ++i)
+ MIB.addOperand(MOs[i]);
+ if (NumAddrOps < 4) // FrameIndex only
+ addOffset(MIB, 0);
+ } else {
+ MIB.addOperand(MO);
+ }
+ }
+ return MIB;
+}
+
+static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode,
+ const SmallVectorImpl<MachineOperand> &MOs,
+ MachineInstr *MI) {
+ MachineFunction &MF = *MI->getParent()->getParent();
+ MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), TII.get(Opcode));
+
+ unsigned NumAddrOps = MOs.size();
+ for (unsigned i = 0; i != NumAddrOps; ++i)
+ MIB.addOperand(MOs[i]);
+ if (NumAddrOps < 4) // FrameIndex only
+ addOffset(MIB, 0);
+ return MIB.addImm(0);
+}
+
+MachineInstr*
+X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr *MI, unsigned i,
+ const SmallVectorImpl<MachineOperand> &MOs,
+ unsigned Size, unsigned Align) const {
+ const DenseMap<unsigned*, std::pair<unsigned,unsigned> > *OpcodeTablePtr=NULL;
+ bool isTwoAddrFold = false;
+ unsigned NumOps = MI->getDesc().getNumOperands();
+ bool isTwoAddr = NumOps > 1 &&
+ MI->getDesc().getOperandConstraint(1, TOI::TIED_TO) != -1;
+
+ MachineInstr *NewMI = NULL;
+ // Folding a memory location into the two-address part of a two-address
+ // instruction is different from folding it in other places: it requires
+ // replacing the *two* registers with the memory location.
+ if (isTwoAddr && NumOps >= 2 && i < 2 &&
+ MI->getOperand(0).isReg() &&
+ MI->getOperand(1).isReg() &&
+ MI->getOperand(0).getReg() == MI->getOperand(1).getReg()) {
+ OpcodeTablePtr = &RegOp2MemOpTable2Addr;
+ isTwoAddrFold = true;
+ } else if (i == 0) { // If operand 0
+ if (MI->getOpcode() == X86::MOV64r0)
+ NewMI = MakeM0Inst(*this, X86::MOV64mi32, MOs, MI);
+ else if (MI->getOpcode() == X86::MOV32r0)
+ NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, MI);
+ else if (MI->getOpcode() == X86::MOV16r0)
+ NewMI = MakeM0Inst(*this, X86::MOV16mi, MOs, MI);
+ else if (MI->getOpcode() == X86::MOV8r0)
+ NewMI = MakeM0Inst(*this, X86::MOV8mi, MOs, MI);
+ if (NewMI)
+ return NewMI;
+
+ OpcodeTablePtr = &RegOp2MemOpTable0;
+ } else if (i == 1) {
+ OpcodeTablePtr = &RegOp2MemOpTable1;
+ } else if (i == 2) {
+ OpcodeTablePtr = &RegOp2MemOpTable2;
+ }
+
+ // If table selected...
+ if (OpcodeTablePtr) {
+ // Find the Opcode to fuse
+ DenseMap<unsigned*, std::pair<unsigned,unsigned> >::const_iterator I =
+ OpcodeTablePtr->find((unsigned*)MI->getOpcode());
+ if (I != OpcodeTablePtr->end()) {
+ unsigned Opcode = I->second.first;
+ unsigned MinAlign = I->second.second;
+ if (Align < MinAlign)
+ return NULL;
+ bool NarrowToMOV32rm = false;
+ if (Size) {
+ unsigned RCSize = MI->getDesc().OpInfo[i].getRegClass(&RI)->getSize();
+ if (Size < RCSize) {
+ // Check if it's safe to fold the load. If the size of the object is
+ // narrower than the load width, then it's not.
+ if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
+ return NULL;
+ // If this is a 64-bit load, but the spill slot is only 32 bits wide, we
+ // can do a 32-bit load, which is implicitly zero-extended. This is likely
+ // due to live-interval analysis rematerializing a load from a stack slot.
+ if (MI->getOperand(0).getSubReg() || MI->getOperand(1).getSubReg())
+ return NULL;
+ Opcode = X86::MOV32rm;
+ NarrowToMOV32rm = true;
+ }
+ }
+
+ if (isTwoAddrFold)
+ NewMI = FuseTwoAddrInst(MF, Opcode, MOs, MI, *this);
+ else
+ NewMI = FuseInst(MF, Opcode, i, MOs, MI, *this);
+
+ if (NarrowToMOV32rm) {
+ // This is the special case where we use a MOV32rm to load a 32-bit
+ // value and zero-extend the top bits; change the destination register
+ // to a 32-bit one.
+ unsigned DstReg = NewMI->getOperand(0).getReg();
+ if (TargetRegisterInfo::isPhysicalRegister(DstReg))
+ NewMI->getOperand(0).setReg(RI.getSubReg(DstReg,
+ X86::sub_32bit));
+ else
+ NewMI->getOperand(0).setSubReg(X86::sub_32bit);
+ }
+ return NewMI;
+ }
+ }
+
+ // No fusion
+ if (PrintFailedFusing)
+ dbgs() << "We failed to fuse operand " << i << " in " << *MI;
+ return NULL;
+}
+
+
+MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr *MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ int FrameIndex) const {
+ // Check switch flag
+ if (NoFusing) return NULL;
+
+ if (!MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize))
+ switch (MI->getOpcode()) {
+ case X86::CVTSD2SSrr:
+ case X86::Int_CVTSD2SSrr:
+ case X86::CVTSS2SDrr:
+ case X86::Int_CVTSS2SDrr:
+ case X86::RCPSSr:
+ case X86::RCPSSr_Int:
+ case X86::ROUNDSDr_Int:
+ case X86::ROUNDSSr_Int:
+ case X86::RSQRTSSr:
+ case X86::RSQRTSSr_Int:
+ case X86::SQRTSSr:
+ case X86::SQRTSSr_Int:
+ return 0;
+ }
+
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ unsigned Size = MFI->getObjectSize(FrameIndex);
+ unsigned Alignment = MFI->getObjectAlignment(FrameIndex);
+ if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
+ unsigned NewOpc = 0;
+ unsigned RCSize = 0;
+ switch (MI->getOpcode()) {
+ default: return NULL;
+ case X86::TEST8rr: NewOpc = X86::CMP8ri; RCSize = 1; break;
+ case X86::TEST16rr: NewOpc = X86::CMP16ri8; RCSize = 2; break;
+ case X86::TEST32rr: NewOpc = X86::CMP32ri8; RCSize = 4; break;
+ case X86::TEST64rr: NewOpc = X86::CMP64ri8; RCSize = 8; break;
+ }
+ // Check if it's safe to fold the load. If the size of the object is
+ // narrower than the load width, then it's not.
+ if (Size < RCSize)
+ return NULL;
+ // Change to CMPXXri r, 0 first.
+ MI->setDesc(get(NewOpc));
+ MI->getOperand(1).ChangeToImmediate(0);
+ } else if (Ops.size() != 1)
+ return NULL;
+
+ SmallVector<MachineOperand,4> MOs;
+ MOs.push_back(MachineOperand::CreateFI(FrameIndex));
+ return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, Size, Alignment);
+}
+
+MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr *MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ MachineInstr *LoadMI) const {
+ // Check switch flag
+ if (NoFusing) return NULL;
+
+ if (!MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize))
+ switch (MI->getOpcode()) {
+ case X86::CVTSD2SSrr:
+ case X86::Int_CVTSD2SSrr:
+ case X86::CVTSS2SDrr:
+ case X86::Int_CVTSS2SDrr:
+ case X86::RCPSSr:
+ case X86::RCPSSr_Int:
+ case X86::ROUNDSDr_Int:
+ case X86::ROUNDSSr_Int:
+ case X86::RSQRTSSr:
+ case X86::RSQRTSSr_Int:
+ case X86::SQRTSSr:
+ case X86::SQRTSSr_Int:
+ return 0;
+ }
+
+ // Determine the alignment of the load.
+ unsigned Alignment = 0;
+ if (LoadMI->hasOneMemOperand())
+ Alignment = (*LoadMI->memoperands_begin())->getAlignment();
+ else
+ switch (LoadMI->getOpcode()) {
+ case X86::V_SET0PS:
+ case X86::V_SET0PD:
+ case X86::V_SET0PI:
+ case X86::V_SETALLONES:
+ Alignment = 16;
+ break;
+ case X86::FsFLD0SD:
+ Alignment = 8;
+ break;
+ case X86::FsFLD0SS:
+ Alignment = 4;
+ break;
+ default:
+ llvm_unreachable("Don't know how to fold this instruction!");
+ }
+ if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
+ unsigned NewOpc = 0;
+ switch (MI->getOpcode()) {
+ default: return NULL;
+ case X86::TEST8rr: NewOpc = X86::CMP8ri; break;
+ case X86::TEST16rr: NewOpc = X86::CMP16ri8; break;
+ case X86::TEST32rr: NewOpc = X86::CMP32ri8; break;
+ case X86::TEST64rr: NewOpc = X86::CMP64ri8; break;
+ }
+ // Change to CMPXXri r, 0 first.
+ MI->setDesc(get(NewOpc));
+ MI->getOperand(1).ChangeToImmediate(0);
+ } else if (Ops.size() != 1)
+ return NULL;
+
+ SmallVector<MachineOperand,X86AddrNumOperands> MOs;
+ switch (LoadMI->getOpcode()) {
+ case X86::V_SET0PS:
+ case X86::V_SET0PD:
+ case X86::V_SET0PI:
+ case X86::V_SETALLONES:
+ case X86::FsFLD0SD:
+ case X86::FsFLD0SS: {
+ // Folding a V_SET0P? or V_SETALLONES as a load, to ease register pressure.
+ // Create a constant-pool entry and operands to load from it.
+
+ // The medium and large code models can't fold loads this way.
+ if (TM.getCodeModel() != CodeModel::Small &&
+ TM.getCodeModel() != CodeModel::Kernel)
+ return NULL;
+
+ // x86-32 PIC requires a PIC base register for constant pools.
+ unsigned PICBase = 0;
+ if (TM.getRelocationModel() == Reloc::PIC_) {
+ if (TM.getSubtarget<X86Subtarget>().is64Bit())
+ PICBase = X86::RIP;
+ else
+ // FIXME: PICBase = TM.getInstrInfo()->getGlobalBaseReg(&MF);
+ // This doesn't work for several reasons.
+ // 1. GlobalBaseReg may have been spilled.
+ // 2. It may not be live at MI.
+ return NULL;
+ }
+
+ // Create a constant-pool entry.
+ MachineConstantPool &MCP = *MF.getConstantPool();
+ const Type *Ty;
+ if (LoadMI->getOpcode() == X86::FsFLD0SS)
+ Ty = Type::getFloatTy(MF.getFunction()->getContext());
+ else if (LoadMI->getOpcode() == X86::FsFLD0SD)
+ Ty = Type::getDoubleTy(MF.getFunction()->getContext());
+ else
+ Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4);
+ const Constant *C = LoadMI->getOpcode() == X86::V_SETALLONES ?
+ Constant::getAllOnesValue(Ty) :
+ Constant::getNullValue(Ty);
+ unsigned CPI = MCP.getConstantPoolIndex(C, Alignment);
+
+ // Create operands to load from the constant pool entry.
+ MOs.push_back(MachineOperand::CreateReg(PICBase, false));
+ MOs.push_back(MachineOperand::CreateImm(1));
+ MOs.push_back(MachineOperand::CreateReg(0, false));
+ MOs.push_back(MachineOperand::CreateCPI(CPI, 0));
+ MOs.push_back(MachineOperand::CreateReg(0, false));
+ break;
+ }
+ default: {
+ // Folding a normal load. Just copy the load's address operands.
+ unsigned NumOps = LoadMI->getDesc().getNumOperands();
+ for (unsigned i = NumOps - X86AddrNumOperands; i != NumOps; ++i)
+ MOs.push_back(LoadMI->getOperand(i));
+ break;
+ }
+ }
+ return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, 0, Alignment);
+}
+
+
+bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI,
+ const SmallVectorImpl<unsigned> &Ops) const {
+ // Check switch flag
+ if (NoFusing) return false;
+
+ if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
+ switch (MI->getOpcode()) {
+ default: return false;
+ case X86::TEST8rr:
+ case X86::TEST16rr:
+ case X86::TEST32rr:
+ case X86::TEST64rr:
+ return true;
+ }
+ }
+
+ if (Ops.size() != 1)
+ return false;
+
+ unsigned OpNum = Ops[0];
+ unsigned Opc = MI->getOpcode();
+ unsigned NumOps = MI->getDesc().getNumOperands();
+ bool isTwoAddr = NumOps > 1 &&
+ MI->getDesc().getOperandConstraint(1, TOI::TIED_TO) != -1;
+
+ // Folding a memory location into the two-address part of a two-address
+ // instruction is different from folding it in other places: it requires
+ // replacing the *two* registers with the memory location.
+ const DenseMap<unsigned*, std::pair<unsigned,unsigned> > *OpcodeTablePtr=NULL;
+ if (isTwoAddr && NumOps >= 2 && OpNum < 2) {
+ OpcodeTablePtr = &RegOp2MemOpTable2Addr;
+ } else if (OpNum == 0) { // If operand 0
+ switch (Opc) {
+ case X86::MOV8r0:
+ case X86::MOV16r0:
+ case X86::MOV32r0:
+ case X86::MOV64r0:
+ return true;
+ default: break;
+ }
+ OpcodeTablePtr = &RegOp2MemOpTable0;
+ } else if (OpNum == 1) {
+ OpcodeTablePtr = &RegOp2MemOpTable1;
+ } else if (OpNum == 2) {
+ OpcodeTablePtr = &RegOp2MemOpTable2;
+ }
+
+ if (OpcodeTablePtr) {
+ // Find the Opcode to fuse
+ DenseMap<unsigned*, std::pair<unsigned,unsigned> >::const_iterator I =
+ OpcodeTablePtr->find((unsigned*)Opc);
+ if (I != OpcodeTablePtr->end())
+ return true;
+ }
+ return false;
+}
+
+bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
+ unsigned Reg, bool UnfoldLoad, bool UnfoldStore,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const {
+ DenseMap<unsigned*, std::pair<unsigned,unsigned> >::const_iterator I =
+ MemOp2RegOpTable.find((unsigned*)MI->getOpcode());
+ if (I == MemOp2RegOpTable.end())
+ return false;
+ unsigned Opc = I->second.first;
+ unsigned Index = I->second.second & 0xf;
+ bool FoldedLoad = I->second.second & (1 << 4);
+ bool FoldedStore = I->second.second & (1 << 5);
+ if (UnfoldLoad && !FoldedLoad)
+ return false;
+ UnfoldLoad &= FoldedLoad;
+ if (UnfoldStore && !FoldedStore)
+ return false;
+ UnfoldStore &= FoldedStore;
+
+ const TargetInstrDesc &TID = get(Opc);
+ const TargetOperandInfo &TOI = TID.OpInfo[Index];
+ const TargetRegisterClass *RC = TOI.getRegClass(&RI);
+ SmallVector<MachineOperand, X86AddrNumOperands> AddrOps;
+ SmallVector<MachineOperand,2> BeforeOps;
+ SmallVector<MachineOperand,2> AfterOps;
+ SmallVector<MachineOperand,4> ImpOps;
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &Op = MI->getOperand(i);
+ if (i >= Index && i < Index + X86AddrNumOperands)
+ AddrOps.push_back(Op);
+ else if (Op.isReg() && Op.isImplicit())
+ ImpOps.push_back(Op);
+ else if (i < Index)
+ BeforeOps.push_back(Op);
+ else if (i > Index)
+ AfterOps.push_back(Op);
+ }
+
+ // Emit the load instruction.
+ if (UnfoldLoad) {
+ std::pair<MachineInstr::mmo_iterator,
+ MachineInstr::mmo_iterator> MMOs =
+ MF.extractLoadMemRefs(MI->memoperands_begin(),
+ MI->memoperands_end());
+ loadRegFromAddr(MF, Reg, AddrOps, RC, MMOs.first, MMOs.second, NewMIs);
+ if (UnfoldStore) {
+ // Address operands cannot be marked isKill.
+ for (unsigned i = 1; i != 1 + X86AddrNumOperands; ++i) {
+ MachineOperand &MO = NewMIs[0]->getOperand(i);
+ if (MO.isReg())
+ MO.setIsKill(false);
+ }
+ }
+ }
+
+ // Emit the data processing instruction.
+ MachineInstr *DataMI = MF.CreateMachineInstr(TID, MI->getDebugLoc(), true);
+ MachineInstrBuilder MIB(DataMI);
+
+ if (FoldedStore)
+ MIB.addReg(Reg, RegState::Define);
+ for (unsigned i = 0, e = BeforeOps.size(); i != e; ++i)
+ MIB.addOperand(BeforeOps[i]);
+ if (FoldedLoad)
+ MIB.addReg(Reg);
+ for (unsigned i = 0, e = AfterOps.size(); i != e; ++i)
+ MIB.addOperand(AfterOps[i]);
+ for (unsigned i = 0, e = ImpOps.size(); i != e; ++i) {
+ MachineOperand &MO = ImpOps[i];
+ MIB.addReg(MO.getReg(),
+ getDefRegState(MO.isDef()) |
+ RegState::Implicit |
+ getKillRegState(MO.isKill()) |
+ getDeadRegState(MO.isDead()) |
+ getUndefRegState(MO.isUndef()));
+ }
+ // Change CMP32ri r, 0 back to TEST32rr r, r, etc.
+ unsigned NewOpc = 0;
+ switch (DataMI->getOpcode()) {
+ default: break;
+ case X86::CMP64ri32:
+ case X86::CMP64ri8:
+ case X86::CMP32ri:
+ case X86::CMP32ri8:
+ case X86::CMP16ri:
+ case X86::CMP16ri8:
+ case X86::CMP8ri: {
+ MachineOperand &MO0 = DataMI->getOperand(0);
+ MachineOperand &MO1 = DataMI->getOperand(1);
+ if (MO1.getImm() == 0) {
+ switch (DataMI->getOpcode()) {
+ default: break;
+ case X86::CMP64ri8:
+ case X86::CMP64ri32: NewOpc = X86::TEST64rr; break;
+ case X86::CMP32ri8:
+ case X86::CMP32ri: NewOpc = X86::TEST32rr; break;
+ case X86::CMP16ri8:
+ case X86::CMP16ri: NewOpc = X86::TEST16rr; break;
+ case X86::CMP8ri: NewOpc = X86::TEST8rr; break;
+ }
+ DataMI->setDesc(get(NewOpc));
+ MO1.ChangeToRegister(MO0.getReg(), false);
+ }
+ }
+ }
+ NewMIs.push_back(DataMI);
+
+ // Emit the store instruction.
+ if (UnfoldStore) {
+ const TargetRegisterClass *DstRC = TID.OpInfo[0].getRegClass(&RI);
+ std::pair<MachineInstr::mmo_iterator,
+ MachineInstr::mmo_iterator> MMOs =
+ MF.extractStoreMemRefs(MI->memoperands_begin(),
+ MI->memoperands_end());
+ storeRegToAddr(MF, Reg, true, AddrOps, DstRC, MMOs.first, MMOs.second, NewMIs);
+ }
+
+ return true;
+}
+
+bool
+X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
+ SmallVectorImpl<SDNode*> &NewNodes) const {
+ if (!N->isMachineOpcode())
+ return false;
+
+ DenseMap<unsigned*, std::pair<unsigned,unsigned> >::const_iterator I =
+ MemOp2RegOpTable.find((unsigned*)N->getMachineOpcode());
+ if (I == MemOp2RegOpTable.end())
+ return false;
+ unsigned Opc = I->second.first;
+ unsigned Index = I->second.second & 0xf;
+ bool FoldedLoad = I->second.second & (1 << 4);
+ bool FoldedStore = I->second.second & (1 << 5);
+ const TargetInstrDesc &TID = get(Opc);
+ const TargetRegisterClass *RC = TID.OpInfo[Index].getRegClass(&RI);
+ unsigned NumDefs = TID.NumDefs;
+ std::vector<SDValue> AddrOps;
+ std::vector<SDValue> BeforeOps;
+ std::vector<SDValue> AfterOps;
+ DebugLoc dl = N->getDebugLoc();
+ unsigned NumOps = N->getNumOperands();
+ for (unsigned i = 0; i != NumOps-1; ++i) {
+ SDValue Op = N->getOperand(i);
+ if (i >= Index-NumDefs && i < Index-NumDefs + X86AddrNumOperands)
+ AddrOps.push_back(Op);
+ else if (i < Index-NumDefs)
+ BeforeOps.push_back(Op);
+ else if (i > Index-NumDefs)
+ AfterOps.push_back(Op);
+ }
+ SDValue Chain = N->getOperand(NumOps-1);
+ AddrOps.push_back(Chain);
+
+ // Emit the load instruction.
+ SDNode *Load = 0;
+ MachineFunction &MF = DAG.getMachineFunction();
+ if (FoldedLoad) {
+ EVT VT = *RC->vt_begin();
+ std::pair<MachineInstr::mmo_iterator,
+ MachineInstr::mmo_iterator> MMOs =
+ MF.extractLoadMemRefs(cast<MachineSDNode>(N)->memoperands_begin(),
+ cast<MachineSDNode>(N)->memoperands_end());
+ bool isAligned = (*MMOs.first)->getAlignment() >= 16;
+ Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, TM), dl,
+ VT, MVT::Other, &AddrOps[0], AddrOps.size());
+ NewNodes.push_back(Load);
+
+ // Preserve memory reference information.
+ cast<MachineSDNode>(Load)->setMemRefs(MMOs.first, MMOs.second);
+ }
+
+ // Emit the data processing instruction.
+ std::vector<EVT> VTs;
+ const TargetRegisterClass *DstRC = 0;
+ if (TID.getNumDefs() > 0) {
+ DstRC = TID.OpInfo[0].getRegClass(&RI);
+ VTs.push_back(*DstRC->vt_begin());
+ }
+ for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
+ EVT VT = N->getValueType(i);
+ if (VT != MVT::Other && i >= (unsigned)TID.getNumDefs())
+ VTs.push_back(VT);
+ }
+ if (Load)
+ BeforeOps.push_back(SDValue(Load, 0));
+ std::copy(AfterOps.begin(), AfterOps.end(), std::back_inserter(BeforeOps));
+ SDNode *NewNode= DAG.getMachineNode(Opc, dl, VTs, &BeforeOps[0],
+ BeforeOps.size());
+ NewNodes.push_back(NewNode);
+
+ // Emit the store instruction.
+ if (FoldedStore) {
+ AddrOps.pop_back();
+ AddrOps.push_back(SDValue(NewNode, 0));
+ AddrOps.push_back(Chain);
+ std::pair<MachineInstr::mmo_iterator,
+ MachineInstr::mmo_iterator> MMOs =
+ MF.extractStoreMemRefs(cast<MachineSDNode>(N)->memoperands_begin(),
+ cast<MachineSDNode>(N)->memoperands_end());
+ bool isAligned = (*MMOs.first)->getAlignment() >= 16;
+ SDNode *Store = DAG.getMachineNode(getStoreRegOpcode(0, DstRC,
+ isAligned, TM),
+ dl, MVT::Other,
+ &AddrOps[0], AddrOps.size());
+ NewNodes.push_back(Store);
+
+ // Preserve memory reference information.
+ cast<MachineSDNode>(Store)->setMemRefs(MMOs.first, MMOs.second);
+ }
+
+ return true;
+}
+
+unsigned X86InstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc,
+ bool UnfoldLoad, bool UnfoldStore,
+ unsigned *LoadRegIndex) const {
+ DenseMap<unsigned*, std::pair<unsigned,unsigned> >::const_iterator I =
+ MemOp2RegOpTable.find((unsigned*)Opc);
+ if (I == MemOp2RegOpTable.end())
+ return 0;
+ bool FoldedLoad = I->second.second & (1 << 4);
+ bool FoldedStore = I->second.second & (1 << 5);
+ if (UnfoldLoad && !FoldedLoad)
+ return 0;
+ if (UnfoldStore && !FoldedStore)
+ return 0;
+ if (LoadRegIndex)
+ *LoadRegIndex = I->second.second & 0xf;
+ return I->second.first;
+}
+
+bool
+X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
+ int64_t &Offset1, int64_t &Offset2) const {
+ if (!Load1->isMachineOpcode() || !Load2->isMachineOpcode())
+ return false;
+ unsigned Opc1 = Load1->getMachineOpcode();
+ unsigned Opc2 = Load2->getMachineOpcode();
+ switch (Opc1) {
+ default: return false;
+ case X86::MOV8rm:
+ case X86::MOV16rm:
+ case X86::MOV32rm:
+ case X86::MOV64rm:
+ case X86::LD_Fp32m:
+ case X86::LD_Fp64m:
+ case X86::LD_Fp80m:
+ case X86::MOVSSrm:
+ case X86::MOVSDrm:
+ case X86::MMX_MOVD64rm:
+ case X86::MMX_MOVQ64rm:
+ case X86::FsMOVAPSrm:
+ case X86::FsMOVAPDrm:
+ case X86::MOVAPSrm:
+ case X86::MOVUPSrm:
+ case X86::MOVUPSrm_Int:
+ case X86::MOVAPDrm:
+ case X86::MOVDQArm:
+ case X86::MOVDQUrm:
+ case X86::MOVDQUrm_Int:
+ break;
+ }
+ switch (Opc2) {
+ default: return false;
+ case X86::MOV8rm:
+ case X86::MOV16rm:
+ case X86::MOV32rm:
+ case X86::MOV64rm:
+ case X86::LD_Fp32m:
+ case X86::LD_Fp64m:
+ case X86::LD_Fp80m:
+ case X86::MOVSSrm:
+ case X86::MOVSDrm:
+ case X86::MMX_MOVD64rm:
+ case X86::MMX_MOVQ64rm:
+ case X86::FsMOVAPSrm:
+ case X86::FsMOVAPDrm:
+ case X86::MOVAPSrm:
+ case X86::MOVUPSrm:
+ case X86::MOVUPSrm_Int:
+ case X86::MOVAPDrm:
+ case X86::MOVDQArm:
+ case X86::MOVDQUrm:
+ case X86::MOVDQUrm_Int:
+ break;
+ }
+
+ // Check if chain operands and base addresses match.
+ if (Load1->getOperand(0) != Load2->getOperand(0) ||
+ Load1->getOperand(5) != Load2->getOperand(5))
+ return false;
+ // Segment operands should match as well.
+ if (Load1->getOperand(4) != Load2->getOperand(4))
+ return false;
+ // Scale should be 1, Index should be Reg0.
+ if (Load1->getOperand(1) == Load2->getOperand(1) &&
+ Load1->getOperand(2) == Load2->getOperand(2)) {
+ if (cast<ConstantSDNode>(Load1->getOperand(1))->getZExtValue() != 1)
+ return false;
+
+ // Now let's examine the displacements.
+ if (isa<ConstantSDNode>(Load1->getOperand(3)) &&
+ isa<ConstantSDNode>(Load2->getOperand(3))) {
+ Offset1 = cast<ConstantSDNode>(Load1->getOperand(3))->getSExtValue();
+ Offset2 = cast<ConstantSDNode>(Load2->getOperand(3))->getSExtValue();
+ return true;
+ }
+ }
+ return false;
+}
+
+bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
+ int64_t Offset1, int64_t Offset2,
+ unsigned NumLoads) const {
+ assert(Offset2 > Offset1);
+ if ((Offset2 - Offset1) / 8 > 64)
+ return false;
+
+ unsigned Opc1 = Load1->getMachineOpcode();
+ unsigned Opc2 = Load2->getMachineOpcode();
+ if (Opc1 != Opc2)
+ return false; // FIXME: overly conservative?
+
+ switch (Opc1) {
+ default: break;
+ case X86::LD_Fp32m:
+ case X86::LD_Fp64m:
+ case X86::LD_Fp80m:
+ case X86::MMX_MOVD64rm:
+ case X86::MMX_MOVQ64rm:
+ return false;
+ }
+
+ EVT VT = Load1->getValueType(0);
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: {
+ // XMM registers. In 64-bit mode we can be a bit more aggressive since we
+ // have 16 of them to play with.
+ if (TM.getSubtargetImpl()->is64Bit()) {
+ if (NumLoads >= 3)
+ return false;
+ } else if (NumLoads)
+ return false;
+ break;
+ }
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ case MVT::i64:
+ case MVT::f32:
+ case MVT::f64:
+ if (NumLoads)
+ return false;
+ }
+
+ return true;
+}
+
+
+bool X86InstrInfo::
+ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+ assert(Cond.size() == 1 && "Invalid X86 branch condition!");
+ X86::CondCode CC = static_cast<X86::CondCode>(Cond[0].getImm());
+ if (CC == X86::COND_NE_OR_P || CC == X86::COND_NP_OR_E)
+ return true;
+ Cond[0].setImm(GetOppositeBranchCondition(CC));
+ return false;
+}
+
+bool X86InstrInfo::
+isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
+ // FIXME: Return false for x87 stack register classes for now. We can't
+ // allow any loads of these registers before FpGet_ST0_80.
+ return !(RC == &X86::CCRRegClass || RC == &X86::RFP32RegClass ||
+ RC == &X86::RFP64RegClass || RC == &X86::RFP80RegClass);
+}
+
+
+/// isX86_64ExtendedReg - Is the given register an x86-64 extended (r8 or
+/// higher) register? e.g. r8, xmm8, xmm13, etc.
+bool X86InstrInfo::isX86_64ExtendedReg(unsigned RegNo) {
+ switch (RegNo) {
+ default: break;
+ case X86::R8: case X86::R9: case X86::R10: case X86::R11:
+ case X86::R12: case X86::R13: case X86::R14: case X86::R15:
+ case X86::R8D: case X86::R9D: case X86::R10D: case X86::R11D:
+ case X86::R12D: case X86::R13D: case X86::R14D: case X86::R15D:
+ case X86::R8W: case X86::R9W: case X86::R10W: case X86::R11W:
+ case X86::R12W: case X86::R13W: case X86::R14W: case X86::R15W:
+ case X86::R8B: case X86::R9B: case X86::R10B: case X86::R11B:
+ case X86::R12B: case X86::R13B: case X86::R14B: case X86::R15B:
+ case X86::XMM8: case X86::XMM9: case X86::XMM10: case X86::XMM11:
+ case X86::XMM12: case X86::XMM13: case X86::XMM14: case X86::XMM15:
+ return true;
+ }
+ return false;
+}
+
+
+/// determineREX - Determine whether the MachineInstr has to be encoded with an
+/// X86-64 REX prefix, which specifies 1) 64-bit instructions, 2) non-default
+/// operand size, and 3) use of X86-64 extended registers.
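+/// Note: the value returned follows the REX bit layout (bit 3 = REX.W,
+/// bit 2 = REX.R, bit 1 = REX.X, bit 0 = REX.B); 0x40 is ORed in directly
+/// when SPL, BPL, SIL or DIL force an otherwise empty prefix.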
+unsigned X86InstrInfo::determineREX(const MachineInstr &MI) {
+ unsigned REX = 0;
+ const TargetInstrDesc &Desc = MI.getDesc();
+
+ // Pseudo instructions do not need REX prefix byte.
+ if ((Desc.TSFlags & X86II::FormMask) == X86II::Pseudo)
+ return 0;
+ if (Desc.TSFlags & X86II::REX_W)
+ REX |= 1 << 3;
+
+ unsigned NumOps = Desc.getNumOperands();
+ if (NumOps) {
+ bool isTwoAddr = NumOps > 1 &&
+ Desc.getOperandConstraint(1, TOI::TIED_TO) != -1;
+
+ // If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix.
+ unsigned i = isTwoAddr ? 1 : 0;
+ for (unsigned e = NumOps; i != e; ++i) {
+ const MachineOperand& MO = MI.getOperand(i);
+ if (MO.isReg()) {
+ unsigned Reg = MO.getReg();
+ if (isX86_64NonExtLowByteReg(Reg))
+ REX |= 0x40;
+ }
+ }
+
+ switch (Desc.TSFlags & X86II::FormMask) {
+ case X86II::MRMInitReg:
+ if (isX86_64ExtendedReg(MI.getOperand(0)))
+ REX |= (1 << 0) | (1 << 2);
+ break;
+ case X86II::MRMSrcReg: {
+ if (isX86_64ExtendedReg(MI.getOperand(0)))
+ REX |= 1 << 2;
+ i = isTwoAddr ? 2 : 1;
+ for (unsigned e = NumOps; i != e; ++i) {
+ const MachineOperand& MO = MI.getOperand(i);
+ if (isX86_64ExtendedReg(MO))
+ REX |= 1 << 0;
+ }
+ break;
+ }
+ case X86II::MRMSrcMem: {
+ if (isX86_64ExtendedReg(MI.getOperand(0)))
+ REX |= 1 << 2;
+ unsigned Bit = 0;
+ i = isTwoAddr ? 2 : 1;
+ for (; i != NumOps; ++i) {
+ const MachineOperand& MO = MI.getOperand(i);
+ if (MO.isReg()) {
+ if (isX86_64ExtendedReg(MO))
+ REX |= 1 << Bit;
+ Bit++;
+ }
+ }
+ break;
+ }
+ case X86II::MRM0m: case X86II::MRM1m:
+ case X86II::MRM2m: case X86II::MRM3m:
+ case X86II::MRM4m: case X86II::MRM5m:
+ case X86II::MRM6m: case X86II::MRM7m:
+ case X86II::MRMDestMem: {
+ unsigned e = (isTwoAddr ? X86AddrNumOperands+1 : X86AddrNumOperands);
+ i = isTwoAddr ? 1 : 0;
+ if (NumOps > e && isX86_64ExtendedReg(MI.getOperand(e)))
+ REX |= 1 << 2;
+ unsigned Bit = 0;
+ for (; i != e; ++i) {
+ const MachineOperand& MO = MI.getOperand(i);
+ if (MO.isReg()) {
+ if (isX86_64ExtendedReg(MO))
+ REX |= 1 << Bit;
+ Bit++;
+ }
+ }
+ break;
+ }
+ default: {
+ if (isX86_64ExtendedReg(MI.getOperand(0)))
+ REX |= 1 << 0;
+ i = isTwoAddr ? 2 : 1;
+ for (unsigned e = NumOps; i != e; ++i) {
+ const MachineOperand& MO = MI.getOperand(i);
+ if (isX86_64ExtendedReg(MO))
+ REX |= 1 << 2;
+ }
+ break;
+ }
+ }
+ }
+ return REX;
+}
+
+/// sizePCRelativeBlockAddress - Return the size of a PC-relative block
+/// address instruction.
+///
+static unsigned sizePCRelativeBlockAddress() {
+ return 4;
+}
+
+/// sizeGlobalAddress - Return the size of emitting this global address.
+///
+static unsigned sizeGlobalAddress(bool dword) {
+ return dword ? 8 : 4;
+}
+
+/// sizeConstPoolAddress - Return the size of emitting this constant pool
+/// address.
+///
+static unsigned sizeConstPoolAddress(bool dword) {
+ return dword ? 8 : 4;
+}
+
+/// sizeExternalSymbolAddress - Return the size of emitting this external
+/// symbol.
+///
+static unsigned sizeExternalSymbolAddress(bool dword) {
+ return dword ? 8 : 4;
+}
+
+/// sizeJumpTableAddress - Return the size of emitting this jump table
+/// address.
+///
+static unsigned sizeJumpTableAddress(bool dword) {
+ return dword ? 8 : 4;
+}
+
+static unsigned sizeConstant(unsigned Size) {
+ return Size;
+}
+
+static unsigned sizeRegModRMByte(){
+ return 1;
+}
+
+static unsigned sizeSIBByte(){
+ return 1;
+}
+
+static unsigned getDisplacementFieldSize(const MachineOperand *RelocOp) {
+ unsigned FinalSize = 0;
+ // A simple integer displacement doesn't require a relocation.
+ if (!RelocOp) {
+ FinalSize += sizeConstant(4);
+ return FinalSize;
+ }
+
+ // Otherwise, this is something that requires a relocation.
+ if (RelocOp->isGlobal()) {
+ FinalSize += sizeGlobalAddress(false);
+ } else if (RelocOp->isCPI()) {
+ FinalSize += sizeConstPoolAddress(false);
+ } else if (RelocOp->isJTI()) {
+ FinalSize += sizeJumpTableAddress(false);
+ } else {
+ llvm_unreachable("Unknown value to relocate!");
+ }
+ return FinalSize;
+}
+
+static unsigned getMemModRMByteSize(const MachineInstr &MI, unsigned Op,
+ bool IsPIC, bool Is64BitMode) {
+ const MachineOperand &Op3 = MI.getOperand(Op+3);
+ int DispVal = 0;
+ const MachineOperand *DispForReloc = 0;
+ unsigned FinalSize = 0;
+
+ // Figure out what sort of displacement we have to handle here.
+ if (Op3.isGlobal()) {
+ DispForReloc = &Op3;
+ } else if (Op3.isCPI()) {
+ if (Is64BitMode || IsPIC) {
+ DispForReloc = &Op3;
+ } else {
+ DispVal = 1;
+ }
+ } else if (Op3.isJTI()) {
+ if (Is64BitMode || IsPIC) {
+ DispForReloc = &Op3;
+ } else {
+ DispVal = 1;
+ }
+ } else {
+ DispVal = 1;
+ }
+
+ const MachineOperand &Base = MI.getOperand(Op);
+ const MachineOperand &IndexReg = MI.getOperand(Op+2);
+
+ unsigned BaseReg = Base.getReg();
+
+ // Is a SIB byte needed?
+ if ((!Is64BitMode || DispForReloc || BaseReg != 0) &&
+ IndexReg.getReg() == 0 &&
+ (BaseReg == 0 || X86RegisterInfo::getX86RegNum(BaseReg) != N86::ESP)) {
+ if (BaseReg == 0) { // Just a displacement?
+ // Emit special case [disp32] encoding
+ ++FinalSize;
+ FinalSize += getDisplacementFieldSize(DispForReloc);
+ } else {
+ unsigned BaseRegNo = X86RegisterInfo::getX86RegNum(BaseReg);
+ if (!DispForReloc && DispVal == 0 && BaseRegNo != N86::EBP) {
+ // Emit the simple indirect register encoding, e.g. [EAX].
+ ++FinalSize;
+ // Be pessimistic and assume it's a disp32, not a disp8
+ } else {
+ // Emit the most general non-SIB encoding: [REG+disp32]
+ ++FinalSize;
+ FinalSize += getDisplacementFieldSize(DispForReloc);
+ }
+ }
+
+ } else { // We need a SIB byte, so start by outputting the ModR/M byte first
+ assert(IndexReg.getReg() != X86::ESP &&
+ IndexReg.getReg() != X86::RSP && "Cannot use ESP as index reg!");
+
+ bool ForceDisp32 = false;
+ if (BaseReg == 0 || DispForReloc) {
+ // Emit the normal disp32 encoding.
+ ++FinalSize;
+ ForceDisp32 = true;
+ } else {
+ ++FinalSize;
+ }
+
+ FinalSize += sizeSIBByte();
+
+ // Do we need to output a displacement?
+ if (DispVal != 0 || ForceDisp32) {
+ FinalSize += getDisplacementFieldSize(DispForReloc);
+ }
+ }
+ return FinalSize;
+}
+
+
+static unsigned GetInstSizeWithDesc(const MachineInstr &MI,
+ const TargetInstrDesc *Desc,
+ bool IsPIC, bool Is64BitMode) {
+
+ unsigned Opcode = Desc->Opcode;
+ unsigned FinalSize = 0;
+
+ // Emit the lock opcode prefix as needed.
+ if (Desc->TSFlags & X86II::LOCK) ++FinalSize;
+
+ // Emit segment override opcode prefix as needed.
+ switch (Desc->TSFlags & X86II::SegOvrMask) {
+ case X86II::FS:
+ case X86II::GS:
+ ++FinalSize;
+ break;
+ default: llvm_unreachable("Invalid segment!");
+ case 0: break; // No segment override!
+ }
+
+ // Emit the repeat opcode prefix as needed.
+ if ((Desc->TSFlags & X86II::Op0Mask) == X86II::REP) ++FinalSize;
+
+ // Emit the operand size opcode prefix as needed.
+ if (Desc->TSFlags & X86II::OpSize) ++FinalSize;
+
+ // Emit the address size opcode prefix as needed.
+ if (Desc->TSFlags & X86II::AdSize) ++FinalSize;
+
+ bool Need0FPrefix = false;
+ switch (Desc->TSFlags & X86II::Op0Mask) {
+ case X86II::TB: // Two-byte opcode prefix
+ case X86II::T8: // 0F 38
+ case X86II::TA: // 0F 3A
+ Need0FPrefix = true;
+ break;
+ case X86II::TF: // F2 0F 38
+ ++FinalSize;
+ Need0FPrefix = true;
+ break;
+ case X86II::REP: break; // already handled.
+ case X86II::XS: // F3 0F
+ ++FinalSize;
+ Need0FPrefix = true;
+ break;
+ case X86II::XD: // F2 0F
+ ++FinalSize;
+ Need0FPrefix = true;
+ break;
+ case X86II::D8: case X86II::D9: case X86II::DA: case X86II::DB:
+ case X86II::DC: case X86II::DD: case X86II::DE: case X86II::DF:
+ ++FinalSize;
+ break; // Two-byte opcode prefix
+ default: llvm_unreachable("Invalid prefix!");
+ case 0: break; // No prefix!
+ }
+
+ if (Is64BitMode) {
+ // REX prefix
+ unsigned REX = X86InstrInfo::determineREX(MI);
+ if (REX)
+ ++FinalSize;
+ }
+
+ // 0x0F escape code must be emitted just before the opcode.
+ if (Need0FPrefix)
+ ++FinalSize;
+
+ switch (Desc->TSFlags & X86II::Op0Mask) {
+ case X86II::T8: // 0F 38
+ ++FinalSize;
+ break;
+ case X86II::TA: // 0F 3A
+ ++FinalSize;
+ break;
+ case X86II::TF: // F2 0F 38
+ ++FinalSize;
+ break;
+ }
+
+ // If this is a two-address instruction, skip one of the register operands.
+ unsigned NumOps = Desc->getNumOperands();
+ unsigned CurOp = 0;
+ if (NumOps > 1 && Desc->getOperandConstraint(1, TOI::TIED_TO) != -1)
+ CurOp++;
+ else if (NumOps > 2 && Desc->getOperandConstraint(NumOps-1, TOI::TIED_TO)== 0)
+ // Skip the last source operand that is tied_to the dest reg. e.g. LXADD32
+ --NumOps;
+
+ switch (Desc->TSFlags & X86II::FormMask) {
+ default: llvm_unreachable("Unknown FormMask value in X86 MachineCodeEmitter!");
+ case X86II::Pseudo:
+ // Remember the current PC offset, this is the PIC relocation
+ // base address.
+ switch (Opcode) {
+ default:
+ break;
+ case TargetOpcode::INLINEASM: {
+ const MachineFunction *MF = MI.getParent()->getParent();
+ const TargetInstrInfo &TII = *MF->getTarget().getInstrInfo();
+ FinalSize += TII.getInlineAsmLength(MI.getOperand(0).getSymbolName(),
+ *MF->getTarget().getMCAsmInfo());
+ break;
+ }
+ case TargetOpcode::DBG_LABEL:
+ case TargetOpcode::EH_LABEL:
+ case TargetOpcode::DBG_VALUE:
+ break;
+ case TargetOpcode::IMPLICIT_DEF:
+ case TargetOpcode::KILL:
+ case X86::FP_REG_KILL:
+ break;
+ case X86::MOVPC32r: {
+ // This emits the "call" portion of this pseudo instruction.
+ ++FinalSize;
+ FinalSize += sizeConstant(X86II::getSizeOfImm(Desc->TSFlags));
+ break;
+ }
+ }
+ CurOp = NumOps;
+ break;
+ case X86II::RawFrm:
+ ++FinalSize;
+
+ if (CurOp != NumOps) {
+ const MachineOperand &MO = MI.getOperand(CurOp++);
+ if (MO.isMBB()) {
+ FinalSize += sizePCRelativeBlockAddress();
+ } else if (MO.isGlobal()) {
+ FinalSize += sizeGlobalAddress(false);
+ } else if (MO.isSymbol()) {
+ FinalSize += sizeExternalSymbolAddress(false);
+ } else if (MO.isImm()) {
+ FinalSize += sizeConstant(X86II::getSizeOfImm(Desc->TSFlags));
+ } else {
+ llvm_unreachable("Unknown RawFrm operand!");
+ }
+ }
+ break;
+
+ case X86II::AddRegFrm:
+ ++FinalSize;
+ ++CurOp;
+
+ if (CurOp != NumOps) {
+ const MachineOperand &MO1 = MI.getOperand(CurOp++);
+ unsigned Size = X86II::getSizeOfImm(Desc->TSFlags);
+ if (MO1.isImm())
+ FinalSize += sizeConstant(Size);
+ else {
+ bool dword = false;
+ if (Opcode == X86::MOV64ri)
+ dword = true;
+ if (MO1.isGlobal()) {
+ FinalSize += sizeGlobalAddress(dword);
+ } else if (MO1.isSymbol())
+ FinalSize += sizeExternalSymbolAddress(dword);
+ else if (MO1.isCPI())
+ FinalSize += sizeConstPoolAddress(dword);
+ else if (MO1.isJTI())
+ FinalSize += sizeJumpTableAddress(dword);
+ }
+ }
+ break;
+
+ case X86II::MRMDestReg: {
+ ++FinalSize;
+ FinalSize += sizeRegModRMByte();
+ CurOp += 2;
+ if (CurOp != NumOps) {
+ ++CurOp;
+ FinalSize += sizeConstant(X86II::getSizeOfImm(Desc->TSFlags));
+ }
+ break;
+ }
+ case X86II::MRMDestMem: {
+ ++FinalSize;
+ FinalSize += getMemModRMByteSize(MI, CurOp, IsPIC, Is64BitMode);
+ CurOp += X86AddrNumOperands + 1;
+ if (CurOp != NumOps) {
+ ++CurOp;
+ FinalSize += sizeConstant(X86II::getSizeOfImm(Desc->TSFlags));
+ }
+ break;
+ }
+
+ case X86II::MRMSrcReg:
+ ++FinalSize;
+ FinalSize += sizeRegModRMByte();
+ CurOp += 2;
+ if (CurOp != NumOps) {
+ ++CurOp;
+ FinalSize += sizeConstant(X86II::getSizeOfImm(Desc->TSFlags));
+ }
+ break;
+
+ case X86II::MRMSrcMem: {
+ int AddrOperands;
+ if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r ||
+ Opcode == X86::LEA16r || Opcode == X86::LEA32r)
+ AddrOperands = X86AddrNumOperands - 1; // No segment register
+ else
+ AddrOperands = X86AddrNumOperands;
+
+ ++FinalSize;
+ FinalSize += getMemModRMByteSize(MI, CurOp+1, IsPIC, Is64BitMode);
+ CurOp += AddrOperands + 1;
+ if (CurOp != NumOps) {
+ ++CurOp;
+ FinalSize += sizeConstant(X86II::getSizeOfImm(Desc->TSFlags));
+ }
+ break;
+ }
+
+ case X86II::MRM0r: case X86II::MRM1r:
+ case X86II::MRM2r: case X86II::MRM3r:
+ case X86II::MRM4r: case X86II::MRM5r:
+ case X86II::MRM6r: case X86II::MRM7r:
+ ++FinalSize;
+ if (Desc->getOpcode() == X86::LFENCE ||
+ Desc->getOpcode() == X86::MFENCE) {
+ // Special handling of lfence and mfence.
+ FinalSize += sizeRegModRMByte();
+ } else if (Desc->getOpcode() == X86::MONITOR ||
+ Desc->getOpcode() == X86::MWAIT) {
+ // Special handling of monitor and mwait.
+ FinalSize += sizeRegModRMByte() + 1; // +1 for the opcode.
+ } else {
+ ++CurOp;
+ FinalSize += sizeRegModRMByte();
+ }
+
+ if (CurOp != NumOps) {
+ const MachineOperand &MO1 = MI.getOperand(CurOp++);
+ unsigned Size = X86II::getSizeOfImm(Desc->TSFlags);
+ if (MO1.isImm())
+ FinalSize += sizeConstant(Size);
+ else {
+ bool dword = false;
+ if (Opcode == X86::MOV64ri32)
+ dword = true;
+ if (MO1.isGlobal()) {
+ FinalSize += sizeGlobalAddress(dword);
+ } else if (MO1.isSymbol())
+ FinalSize += sizeExternalSymbolAddress(dword);
+ else if (MO1.isCPI())
+ FinalSize += sizeConstPoolAddress(dword);
+ else if (MO1.isJTI())
+ FinalSize += sizeJumpTableAddress(dword);
+ }
+ }
+ break;
+
+ case X86II::MRM0m: case X86II::MRM1m:
+ case X86II::MRM2m: case X86II::MRM3m:
+ case X86II::MRM4m: case X86II::MRM5m:
+ case X86II::MRM6m: case X86II::MRM7m: {
+
+ ++FinalSize;
+ FinalSize += getMemModRMByteSize(MI, CurOp, IsPIC, Is64BitMode);
+ CurOp += X86AddrNumOperands;
+
+ if (CurOp != NumOps) {
+ const MachineOperand &MO = MI.getOperand(CurOp++);
+ unsigned Size = X86II::getSizeOfImm(Desc->TSFlags);
+ if (MO.isImm())
+ FinalSize += sizeConstant(Size);
+ else {
+ bool dword = false;
+ if (Opcode == X86::MOV64mi32)
+ dword = true;
+ if (MO.isGlobal()) {
+ FinalSize += sizeGlobalAddress(dword);
+ } else if (MO.isSymbol())
+ FinalSize += sizeExternalSymbolAddress(dword);
+ else if (MO.isCPI())
+ FinalSize += sizeConstPoolAddress(dword);
+ else if (MO.isJTI())
+ FinalSize += sizeJumpTableAddress(dword);
+ }
+ }
+ break;
+ }
+
+ case X86II::MRM_C1:
+ case X86II::MRM_C8:
+ case X86II::MRM_C9:
+ case X86II::MRM_E8:
+ case X86II::MRM_F0:
+ FinalSize += 2;
+ break;
+
+ case X86II::MRMInitReg:
+ ++FinalSize;
+ // Duplicate register, used by things like MOV8r0 (aka xor reg,reg).
+ FinalSize += sizeRegModRMByte();
+ ++CurOp;
+ break;
+ }
+
+ if (!Desc->isVariadic() && CurOp != NumOps) {
+ std::string msg;
+ raw_string_ostream Msg(msg);
+ Msg << "Cannot determine size: " << MI;
+ report_fatal_error(Msg.str());
+ }
+
+
+ return FinalSize;
+}
+
+
+unsigned X86InstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
+ const TargetInstrDesc &Desc = MI->getDesc();
+ bool IsPIC = TM.getRelocationModel() == Reloc::PIC_;
+ bool Is64BitMode = TM.getSubtargetImpl()->is64Bit();
+ unsigned Size = GetInstSizeWithDesc(*MI, &Desc, IsPIC, Is64BitMode);
+ if (Desc.getOpcode() == X86::MOVPC32r)
+ Size += GetInstSizeWithDesc(*MI, &get(X86::POP32r), IsPIC, Is64BitMode);
+ return Size;
+}
+
+/// getGlobalBaseReg - Return a virtual register initialized with the
+/// global base register value. Output instructions required to
+/// initialize the register in the function entry block, if necessary.
+///
+unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
+ assert(!TM.getSubtarget<X86Subtarget>().is64Bit() &&
+ "X86-64 PIC uses RIP relative addressing");
+
+ X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
+ unsigned GlobalBaseReg = X86FI->getGlobalBaseReg();
+ if (GlobalBaseReg != 0)
+ return GlobalBaseReg;
+
+ // Insert the set of GlobalBaseReg into the first MBB of the function
+ MachineBasicBlock &FirstMBB = MF->front();
+ MachineBasicBlock::iterator MBBI = FirstMBB.begin();
+ DebugLoc DL = FirstMBB.findDebugLoc(MBBI);
+ MachineRegisterInfo &RegInfo = MF->getRegInfo();
+ unsigned PC = RegInfo.createVirtualRegister(X86::GR32RegisterClass);
+
+ const TargetInstrInfo *TII = TM.getInstrInfo();
+ // The operand of MovePCtoStack is completely ignored by the asm printer. It's
+ // only used in JIT code emission as a displacement relative to the PC.
+ BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOVPC32r), PC).addImm(0);
+
+ // If we're using vanilla 'GOT' PIC style, we should use relative addressing
+ // not to the PC, but to the _GLOBAL_OFFSET_TABLE_ external symbol.
+ if (TM.getSubtarget<X86Subtarget>().isPICStyleGOT()) {
+ GlobalBaseReg = RegInfo.createVirtualRegister(X86::GR32RegisterClass);
+ // Generate addl $__GLOBAL_OFFSET_TABLE_ + [.-piclabel], %some_register
+ BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD32ri), GlobalBaseReg)
+ .addReg(PC).addExternalSymbol("_GLOBAL_OFFSET_TABLE_",
+ X86II::MO_GOT_ABSOLUTE_ADDRESS);
+ } else {
+ GlobalBaseReg = PC;
+ }
+
+ X86FI->setGlobalBaseReg(GlobalBaseReg);
+ return GlobalBaseReg;
+}
+
+// These are the replaceable SSE instructions. Some of these have Int variants
+// that we don't include here. We don't want to replace instructions selected
+// by intrinsics.
+static const unsigned ReplaceableInstrs[][3] = {
+ //PackedSingle PackedDouble PackedInt
+ { X86::MOVAPSmr, X86::MOVAPDmr, X86::MOVDQAmr },
+ { X86::MOVAPSrm, X86::MOVAPDrm, X86::MOVDQArm },
+ { X86::MOVAPSrr, X86::MOVAPDrr, X86::MOVDQArr },
+ { X86::MOVUPSmr, X86::MOVUPDmr, X86::MOVDQUmr },
+ { X86::MOVUPSrm, X86::MOVUPDrm, X86::MOVDQUrm },
+ { X86::MOVNTPSmr, X86::MOVNTPDmr, X86::MOVNTDQmr },
+ { X86::ANDNPSrm, X86::ANDNPDrm, X86::PANDNrm },
+ { X86::ANDNPSrr, X86::ANDNPDrr, X86::PANDNrr },
+ { X86::ANDPSrm, X86::ANDPDrm, X86::PANDrm },
+ { X86::ANDPSrr, X86::ANDPDrr, X86::PANDrr },
+ { X86::ORPSrm, X86::ORPDrm, X86::PORrm },
+ { X86::ORPSrr, X86::ORPDrr, X86::PORrr },
+ { X86::V_SET0PS, X86::V_SET0PD, X86::V_SET0PI },
+ { X86::XORPSrm, X86::XORPDrm, X86::PXORrm },
+ { X86::XORPSrr, X86::XORPDrr, X86::PXORrr },
+};
+
+// FIXME: Some shuffle and unpack instructions have equivalents in different
+// domains, but they require a bit more work than just switching opcodes.
+
+static const unsigned *lookup(unsigned opcode, unsigned domain) {
+ for (unsigned i = 0, e = array_lengthof(ReplaceableInstrs); i != e; ++i)
+ if (ReplaceableInstrs[i][domain-1] == opcode)
+ return ReplaceableInstrs[i];
+ return 0;
+}
+
+std::pair<uint16_t, uint16_t>
+X86InstrInfo::GetSSEDomain(const MachineInstr *MI) const {
+ uint16_t domain = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
+ return std::make_pair(domain,
+ domain && lookup(MI->getOpcode(), domain) ? 0xe : 0);
+}
+
+void X86InstrInfo::SetSSEDomain(MachineInstr *MI, unsigned Domain) const {
+ assert(Domain>0 && Domain<4 && "Invalid execution domain");
+ uint16_t dom = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
+ assert(dom && "Not an SSE instruction");
+ const unsigned *table = lookup(MI->getOpcode(), dom);
+ assert(table && "Cannot change domain");
+ MI->setDesc(get(table[Domain-1]));
+}
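+
+// Illustrative example of the mapping above: MOVAPSrr appears in the
+// PackedSingle column of ReplaceableInstrs, so (assuming its execution-domain
+// bits decode to PackedSingle) GetSSEDomain() reports that domain together
+// with the 0xe mask (presumably meaning all three packed domains are legal),
+// and SetSSEDomain(MI, 3) would rewrite the instruction to its PackedInt
+// equivalent, MOVDQArr.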
+
+/// getNoopForMachoTarget - Return the noop instruction to use for a noop.
+void X86InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
+ NopInst.setOpcode(X86::NOOP);
+}
+
diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.h b/contrib/llvm/lib/Target/X86/X86InstrInfo.h
new file mode 100644
index 0000000..62d7c74
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.h
@@ -0,0 +1,762 @@
+//===- X86InstrInfo.h - X86 Instruction Information ------------*- C++ -*- ===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86INSTRUCTIONINFO_H
+#define X86INSTRUCTIONINFO_H
+
+#include "llvm/Target/TargetInstrInfo.h"
+#include "X86.h"
+#include "X86RegisterInfo.h"
+#include "llvm/ADT/DenseMap.h"
+
+namespace llvm {
+ class X86RegisterInfo;
+ class X86TargetMachine;
+
+namespace X86 {
+ // X86-specific condition codes. These correspond to X86_*_COND in
+ // X86InstrInfo.td and must be kept in sync with it.
+ enum CondCode {
+ COND_A = 0,
+ COND_AE = 1,
+ COND_B = 2,
+ COND_BE = 3,
+ COND_E = 4,
+ COND_G = 5,
+ COND_GE = 6,
+ COND_L = 7,
+ COND_LE = 8,
+ COND_NE = 9,
+ COND_NO = 10,
+ COND_NP = 11,
+ COND_NS = 12,
+ COND_O = 13,
+ COND_P = 14,
+ COND_S = 15,
+
+ // Artificial condition codes. These are used by AnalyzeBranch
+ // to indicate a block terminated with two conditional branches to
+ // the same location. This occurs in code using FCMP_OEQ or FCMP_UNE,
+ // which can't be represented on x86 with a single condition. These
+ // are never used in MachineInstrs.
+ COND_NE_OR_P,
+ COND_NP_OR_E,
+
+ COND_INVALID
+ };
+
+ // Turn condition code into conditional branch opcode.
+ unsigned GetCondBranchFromCond(CondCode CC);
+
+ /// GetOppositeBranchCondition - Return the inverse of the specified cond,
+ /// e.g. turning COND_E to COND_NE.
+ CondCode GetOppositeBranchCondition(X86::CondCode CC);
+
+}
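+
+// Minimal usage sketch of the helpers above (illustrative only):
+//   X86::CondCode CC = X86::COND_E;
+//   X86::CondCode Inv = X86::GetOppositeBranchCondition(CC); // COND_NE
+//   unsigned JccOpc = X86::GetCondBranchFromCond(Inv); // matching branch opcode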
+
+/// X86II - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace X86II {
+ /// Target Operand Flag enum.
+ enum TOF {
+ //===------------------------------------------------------------------===//
+ // X86 Specific MachineOperand flags.
+
+ MO_NO_FLAG,
+
+ /// MO_GOT_ABSOLUTE_ADDRESS - On a symbol operand, this represents a
+ /// relocation of:
+ /// SYMBOL_LABEL + [. - PICBASELABEL]
+ MO_GOT_ABSOLUTE_ADDRESS,
+
+ /// MO_PIC_BASE_OFFSET - On a symbol operand this indicates that the
+ /// immediate should get the value of the symbol minus the PIC base label:
+ /// SYMBOL_LABEL - PICBASELABEL
+ MO_PIC_BASE_OFFSET,
+
+ /// MO_GOT - On a symbol operand this indicates that the immediate is the
+ /// offset to the GOT entry for the symbol name from the base of the GOT.
+ ///
+ /// See the X86-64 ELF ABI supplement for more details.
+ /// SYMBOL_LABEL @GOT
+ MO_GOT,
+
+ /// MO_GOTOFF - On a symbol operand this indicates that the immediate is
+ /// the offset to the location of the symbol name from the base of the GOT.
+ ///
+ /// See the X86-64 ELF ABI supplement for more details.
+ /// SYMBOL_LABEL @GOTOFF
+ MO_GOTOFF,
+
+ /// MO_GOTPCREL - On a symbol operand this indicates that the immediate is
+ /// the offset to the GOT entry for the symbol name from the current code
+ /// location.
+ ///
+ /// See the X86-64 ELF ABI supplement for more details.
+ /// SYMBOL_LABEL @GOTPCREL
+ MO_GOTPCREL,
+
+ /// MO_PLT - On a symbol operand this indicates that the immediate is the
+ /// offset to the PLT entry of the symbol name from the current code location.
+ ///
+ /// See the X86-64 ELF ABI supplement for more details.
+ /// SYMBOL_LABEL @PLT
+ MO_PLT,
+
+ /// MO_TLSGD - On a symbol operand this indicates that the immediate is
+ /// some TLS offset.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @TLSGD
+ MO_TLSGD,
+
+ /// MO_GOTTPOFF - On a symbol operand this indicates that the immediate is
+ /// some TLS offset.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @GOTTPOFF
+ MO_GOTTPOFF,
+
+ /// MO_INDNTPOFF - On a symbol operand this indicates that the immediate is
+ /// some TLS offset.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @INDNTPOFF
+ MO_INDNTPOFF,
+
+ /// MO_TPOFF - On a symbol operand this indicates that the immediate is
+ /// some TLS offset.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @TPOFF
+ MO_TPOFF,
+
+ /// MO_NTPOFF - On a symbol operand this indicates that the immediate is
+ /// some TLS offset.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @NTPOFF
+ MO_NTPOFF,
+
+ /// MO_DLLIMPORT - On a symbol operand "FOO", this indicates that the
+ /// reference is actually to the "__imp_FOO" symbol. This is used for
+ /// dllimport linkage on windows.
+ MO_DLLIMPORT,
+
+ /// MO_DARWIN_STUB - On a symbol operand "FOO", this indicates that the
+ /// reference is actually to the "FOO$stub" symbol. This is used for calls
+ /// and jumps to external functions on Tiger and before.
+ MO_DARWIN_STUB,
+
+ /// MO_DARWIN_NONLAZY - On a symbol operand "FOO", this indicates that the
+ /// reference is actually to the "FOO$non_lazy_ptr" symbol, which is a
+ /// non-PIC-base-relative reference to a non-hidden dyld lazy pointer stub.
+ MO_DARWIN_NONLAZY,
+
+ /// MO_DARWIN_NONLAZY_PIC_BASE - On a symbol operand "FOO", this indicates
+ /// that the reference is actually to "FOO$non_lazy_ptr - PICBASE", which is
+ /// a PIC-base-relative reference to a non-hidden dyld lazy pointer stub.
+ MO_DARWIN_NONLAZY_PIC_BASE,
+
+ /// MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE - On a symbol operand "FOO", this
+ /// indicates that the reference is actually to "FOO$non_lazy_ptr -PICBASE",
+ /// which is a PIC-base-relative reference to a hidden dyld lazy pointer
+ /// stub.
+ MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE
+ };
+}
+
+/// isGlobalStubReference - Return true if the specified TargetFlag operand is
+/// a reference to a stub for a global, not the global itself.
+inline static bool isGlobalStubReference(unsigned char TargetFlag) {
+ switch (TargetFlag) {
+ case X86II::MO_DLLIMPORT: // dllimport stub.
+ case X86II::MO_GOTPCREL: // rip-relative GOT reference.
+ case X86II::MO_GOT: // normal GOT reference.
+ case X86II::MO_DARWIN_NONLAZY_PIC_BASE: // Normal $non_lazy_ptr ref.
+ case X86II::MO_DARWIN_NONLAZY: // Normal $non_lazy_ptr ref.
+ case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: // Hidden $non_lazy_ptr ref.
+ return true;
+ default:
+ return false;
+ }
+}
+
+/// isGlobalRelativeToPICBase - Return true if the specified global value
+/// reference is relative to a 32-bit PIC base (X86ISD::GlobalBaseReg). If this
+/// is true, the addressing mode has the PIC base register added in (e.g. EBX).
+inline static bool isGlobalRelativeToPICBase(unsigned char TargetFlag) {
+ switch (TargetFlag) {
+ case X86II::MO_GOTOFF: // isPICStyleGOT: local global.
+ case X86II::MO_GOT: // isPICStyleGOT: other global.
+ case X86II::MO_PIC_BASE_OFFSET: // Darwin local global.
+ case X86II::MO_DARWIN_NONLAZY_PIC_BASE: // Darwin/32 external global.
+ case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: // Darwin/32 hidden global.
+ return true;
+ default:
+ return false;
+ }
+}
+
+/// X86II - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace X86II {
+ enum {
+ //===------------------------------------------------------------------===//
+ // Instruction encodings. These are the standard/most common forms for X86
+ // instructions.
+ //
+
+ // PseudoFrm - This represents an instruction that is a pseudo instruction
+ // or one that has not been implemented yet. It is illegal to code generate
+ // it, but tolerated for intermediate implementation stages.
+ Pseudo = 0,
+
+ /// Raw - This form is for instructions that don't have any operands, so
+ /// they are just a fixed opcode value, like 'leave'.
+ RawFrm = 1,
+
+ /// AddRegFrm - This form is used for instructions like 'push r32' that have
+ /// their one register operand added to their opcode.
+ AddRegFrm = 2,
+
+ /// MRMDestReg - This form is used for instructions that use the Mod/RM byte
+ /// to specify a destination, which in this case is a register.
+ ///
+ MRMDestReg = 3,
+
+ /// MRMDestMem - This form is used for instructions that use the Mod/RM byte
+ /// to specify a destination, which in this case is memory.
+ ///
+ MRMDestMem = 4,
+
+ /// MRMSrcReg - This form is used for instructions that use the Mod/RM byte
+ /// to specify a source, which in this case is a register.
+ ///
+ MRMSrcReg = 5,
+
+ /// MRMSrcMem - This form is used for instructions that use the Mod/RM byte
+ /// to specify a source, which in this case is memory.
+ ///
+ MRMSrcMem = 6,
+
+ /// MRM[0-7][rm] - These forms are used to represent instructions that use
+ /// a Mod/RM byte, and use the middle field to hold extended opcode
+ /// information. In the Intel manual these are represented as /0, /1, ...
+ ///
+
+ // First, instructions that operate on a register r/m operand...
+ MRM0r = 16, MRM1r = 17, MRM2r = 18, MRM3r = 19, // Format /0 /1 /2 /3
+ MRM4r = 20, MRM5r = 21, MRM6r = 22, MRM7r = 23, // Format /4 /5 /6 /7
+
+ // Next, instructions that operate on a memory r/m operand...
+ MRM0m = 24, MRM1m = 25, MRM2m = 26, MRM3m = 27, // Format /0 /1 /2 /3
+ MRM4m = 28, MRM5m = 29, MRM6m = 30, MRM7m = 31, // Format /4 /5 /6 /7
+
+ // MRMInitReg - This form is used for instructions whose source and
+ // destinations are the same register.
+ MRMInitReg = 32,
+
+ // MRM_C1 through MRM_F9 - A Mod/RM byte of exactly the value named (0xC1, 0xC2, etc.).
+ MRM_C1 = 33,
+ MRM_C2 = 34,
+ MRM_C3 = 35,
+ MRM_C4 = 36,
+ MRM_C8 = 37,
+ MRM_C9 = 38,
+ MRM_E8 = 39,
+ MRM_F0 = 40,
+ MRM_F8 = 41,
+ MRM_F9 = 42,
+
+ FormMask = 63,
+
+ //===------------------------------------------------------------------===//
+ // Actual flags...
+
+ // OpSize - Set if this instruction requires an operand size prefix (0x66),
+ // which most often indicates that the instruction operates on 16 bit data
+ // instead of 32 bit data.
+ OpSize = 1 << 6,
+
+ // AdSize - Set if this instruction requires an address size prefix (0x67),
+ // which most often indicates that the instruction uses 16 bit addresses
+ // instead of 32 bit addresses (or 32 bit addresses in 64 bit mode).
+ AdSize = 1 << 7,
+
+ //===------------------------------------------------------------------===//
+ // Op0Mask - There are several prefix bytes that are used to form two byte
+ // opcodes. These are currently 0x0F, 0xF3, and 0xD8-0xDF. This mask is
+ // used to obtain the setting of this field. If no bits in this field are
+ // set, there is no prefix byte for obtaining a multibyte opcode.
+ //
+ Op0Shift = 8,
+ Op0Mask = 0xF << Op0Shift,
+
+ // TB - TwoByte - Set if this instruction has a two byte opcode, which
+ // starts with a 0x0F byte before the real opcode.
+ TB = 1 << Op0Shift,
+
+ // REP - The 0xF3 prefix byte indicating repetition of the following
+ // instruction.
+ REP = 2 << Op0Shift,
+
+ // D8-DF - These escape opcodes are used by the floating point unit. These
+ // values must remain sequential.
+ D8 = 3 << Op0Shift, D9 = 4 << Op0Shift,
+ DA = 5 << Op0Shift, DB = 6 << Op0Shift,
+ DC = 7 << Op0Shift, DD = 8 << Op0Shift,
+ DE = 9 << Op0Shift, DF = 10 << Op0Shift,
+
+ // XS, XD - These prefix codes are for single and double precision scalar
+ // floating point operations performed in the SSE registers.
+ XD = 11 << Op0Shift, XS = 12 << Op0Shift,
+
+ // T8, TA - Prefix after the 0x0F prefix.
+ T8 = 13 << Op0Shift, TA = 14 << Op0Shift,
+
+ // TF - Prefix before and after 0x0F
+ TF = 15 << Op0Shift,
+
+ //===------------------------------------------------------------------===//
+ // REX_W - REX prefixes are instruction prefixes used in 64-bit mode.
+ // They are used to specify GPRs and SSE registers, 64-bit operand size,
+ // etc. We only care about the REX.W and REX.R bits, and only the former is
+ // statically determined.
+ //
+ REXShift = 12,
+ REX_W = 1 << REXShift,
+
+ //===------------------------------------------------------------------===//
+ // This three-bit field describes the size of an immediate operand. Zero is
+ // unused so that we can tell if we forgot to set a value.
+ ImmShift = 13,
+ ImmMask = 7 << ImmShift,
+ Imm8 = 1 << ImmShift,
+ Imm8PCRel = 2 << ImmShift,
+ Imm16 = 3 << ImmShift,
+ Imm32 = 4 << ImmShift,
+ Imm32PCRel = 5 << ImmShift,
+ Imm64 = 6 << ImmShift,
+
+ //===------------------------------------------------------------------===//
+ // FP Instruction Classification... Zero is non-fp instruction.
+
+ // FPTypeMask - Mask for all of the FP types...
+ FPTypeShift = 16,
+ FPTypeMask = 7 << FPTypeShift,
+
+ // NotFP - The default, set for instructions that do not use FP registers.
+ NotFP = 0 << FPTypeShift,
+
+ // ZeroArgFP - 0 arg FP instruction which implicitly pushes ST(0), e.g. fld0
+ ZeroArgFP = 1 << FPTypeShift,
+
+ // OneArgFP - 1 arg FP instructions which implicitly read ST(0), such as fst
+ OneArgFP = 2 << FPTypeShift,
+
+ // OneArgFPRW - 1 arg FP instruction which implicitly reads ST(0) and writes
+ // a result back to ST(0). For example, fcos, fsqrt, etc.
+ //
+ OneArgFPRW = 3 << FPTypeShift,
+
+ // TwoArgFP - 2 arg FP instructions which implicitly read ST(0), and an
+ // explicit argument, storing the result to either ST(0) or the implicit
+ // argument. For example: fadd, fsub, fmul, etc...
+ TwoArgFP = 4 << FPTypeShift,
+
+ // CompareFP - 2 arg FP instructions which implicitly read ST(0) and an
+ // explicit argument, but have no destination. Example: fucom, fucomi, ...
+ CompareFP = 5 << FPTypeShift,
+
+ // CondMovFP - "2 operand" floating point conditional move instructions.
+ CondMovFP = 6 << FPTypeShift,
+
+ // SpecialFP - Special instruction forms. Dispatch by opcode explicitly.
+ SpecialFP = 7 << FPTypeShift,
+
+ // Lock prefix
+ LOCKShift = 19,
+ LOCK = 1 << LOCKShift,
+
+ // Segment override prefixes. Currently we just need the ability to address
+ // stuff in gs and fs segments.
+ SegOvrShift = 20,
+ SegOvrMask = 3 << SegOvrShift,
+ FS = 1 << SegOvrShift,
+ GS = 2 << SegOvrShift,
+
+ // Execution domain for SSE instructions in bits 22, 23.
+ // 0 in bits 22-23 means normal, non-SSE instruction.
+ SSEDomainShift = 22,
+
+ OpcodeShift = 24,
+ OpcodeMask = 0xFF << OpcodeShift
+ };
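+
+ // A minimal usage sketch (illustrative only; the helpers below are
+ // hypothetical and are not declared elsewhere in this file):
+ //
+ //   unsigned decodeOp0Prefix(unsigned TSFlags) {
+ //     return (TSFlags & X86II::Op0Mask) >> X86II::Op0Shift; // 1 -> TB (0x0F)
+ //   }
+ //   unsigned decodeFPType(unsigned TSFlags) {
+ //     return (TSFlags & X86II::FPTypeMask) >> X86II::FPTypeShift;
+ //   }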
+
+ // getBaseOpcodeFor - This function returns the "base" X86 opcode for the
+ // specified machine instruction.
+ //
+ static inline unsigned char getBaseOpcodeFor(unsigned TSFlags) {
+ return TSFlags >> X86II::OpcodeShift;
+ }
+
+ static inline bool hasImm(unsigned TSFlags) {
+ return (TSFlags & X86II::ImmMask) != 0;
+ }
+
+ /// getSizeOfImm - Decode the "size of immediate" field from the TSFlags field
+ /// of the specified instruction.
+ static inline unsigned getSizeOfImm(unsigned TSFlags) {
+ switch (TSFlags & X86II::ImmMask) {
+ default: assert(0 && "Unknown immediate size");
+ case X86II::Imm8:
+ case X86II::Imm8PCRel: return 1;
+ case X86II::Imm16: return 2;
+ case X86II::Imm32:
+ case X86II::Imm32PCRel: return 4;
+ case X86II::Imm64: return 8;
+ }
+ }
+
+ /// isImmPCRel - Return true if the immediate of the specified instruction's
+ /// TSFlags indicates that it is pc relative.
+ static inline unsigned isImmPCRel(unsigned TSFlags) {
+ switch (TSFlags & X86II::ImmMask) {
+ default: assert(0 && "Unknown immediate size");
+ case X86II::Imm8PCRel:
+ case X86II::Imm32PCRel:
+ return true;
+ case X86II::Imm8:
+ case X86II::Imm16:
+ case X86II::Imm32:
+ case X86II::Imm64:
+ return false;
+ }
+ }
+}
+
+const int X86AddrNumOperands = 5;
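+
+// A short orientation note (a gloss, assuming the memory-operand layout used
+// by this target): the five address operands are, in order, base register,
+// scale immediate (1, 2, 4 or 8), index register, displacement (immediate,
+// global, constant-pool or jump-table entry) and segment register, i.e.
+// segment:[base + scale*index + disp]. isLeaMem below checks the first four;
+// isMem additionally requires the segment-register operand.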
+
+inline static bool isScale(const MachineOperand &MO) {
+ return MO.isImm() &&
+ (MO.getImm() == 1 || MO.getImm() == 2 ||
+ MO.getImm() == 4 || MO.getImm() == 8);
+}
+
+inline static bool isLeaMem(const MachineInstr *MI, unsigned Op) {
+ if (MI->getOperand(Op).isFI()) return true;
+ return Op+4 <= MI->getNumOperands() &&
+ MI->getOperand(Op ).isReg() && isScale(MI->getOperand(Op+1)) &&
+ MI->getOperand(Op+2).isReg() &&
+ (MI->getOperand(Op+3).isImm() ||
+ MI->getOperand(Op+3).isGlobal() ||
+ MI->getOperand(Op+3).isCPI() ||
+ MI->getOperand(Op+3).isJTI());
+}
+
+inline static bool isMem(const MachineInstr *MI, unsigned Op) {
+ if (MI->getOperand(Op).isFI()) return true;
+ return Op+5 <= MI->getNumOperands() &&
+ MI->getOperand(Op+4).isReg() &&
+ isLeaMem(MI, Op);
+}
+
+class X86InstrInfo : public TargetInstrInfoImpl {
+ X86TargetMachine &TM;
+ const X86RegisterInfo RI;
+
+ /// RegOp2MemOpTable2Addr, RegOp2MemOpTable0, RegOp2MemOpTable1,
+ /// RegOp2MemOpTable2 - Load / store folding opcode maps.
+ ///
+ DenseMap<unsigned*, std::pair<unsigned,unsigned> > RegOp2MemOpTable2Addr;
+ DenseMap<unsigned*, std::pair<unsigned,unsigned> > RegOp2MemOpTable0;
+ DenseMap<unsigned*, std::pair<unsigned,unsigned> > RegOp2MemOpTable1;
+ DenseMap<unsigned*, std::pair<unsigned,unsigned> > RegOp2MemOpTable2;
+
+ /// MemOp2RegOpTable - Load / store unfolding opcode map.
+ ///
+ DenseMap<unsigned*, std::pair<unsigned, unsigned> > MemOp2RegOpTable;
+
+public:
+ explicit X86InstrInfo(X86TargetMachine &tm);
+
+ /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
+ /// such, whenever a client has an instance of instruction info, it should
+ /// always be able to get register info as well (through this method).
+ ///
+ virtual const X86RegisterInfo &getRegisterInfo() const { return RI; }
+
+ /// Return true if the instruction is a register to register move and return
+ /// the source and dest operands and their sub-register indices by reference.
+ virtual bool isMoveInstr(const MachineInstr &MI,
+ unsigned &SrcReg, unsigned &DstReg,
+ unsigned &SrcSubIdx, unsigned &DstSubIdx) const;
+
+ /// isCoalescableExtInstr - Return true if the instruction is a "coalescable"
+ /// extension instruction. That is, it's like a copy where it's legal for the
+ /// source to overlap the destination. e.g. X86::MOVSX64rr32. If this returns
+ /// true, then it's expected the pre-extension value is available as a subreg
+ /// of the result register. This also returns the sub-register index in
+ /// SubIdx.
+ virtual bool isCoalescableExtInstr(const MachineInstr &MI,
+ unsigned &SrcReg, unsigned &DstReg,
+ unsigned &SubIdx) const;
+
+ unsigned isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const;
+ /// isLoadFromStackSlotPostFE - Check for post-frame ptr elimination
+ /// stack locations as well. This uses a heuristic so it isn't
+ /// reliable for correctness.
+ unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI,
+ int &FrameIndex) const;
+
+ /// hasLoadFromStackSlot - If the specified machine instruction has
+ /// a load from a stack slot, return true along with the FrameIndex
+ /// of the loaded stack slot and the machine mem operand containing
+ /// the reference. If not, return false. Unlike
+ /// isLoadFromStackSlot, this returns true for any instruction that
+ /// loads from the stack. This is a hint only and may not catch all
+ /// cases.
+ bool hasLoadFromStackSlot(const MachineInstr *MI,
+ const MachineMemOperand *&MMO,
+ int &FrameIndex) const;
+
+ unsigned isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const;
+ /// isStoreToStackSlotPostFE - Check for post-frame ptr elimination
+ /// stack locations as well. This uses a heuristic so it isn't
+ /// reliable for correctness.
+ unsigned isStoreToStackSlotPostFE(const MachineInstr *MI,
+ int &FrameIndex) const;
+
+ /// hasStoreToStackSlot - If the specified machine instruction has a
+ /// store to a stack slot, return true along with the FrameIndex of
+ /// the stack slot stored to and the machine mem operand containing the
+ /// reference. If not, return false. Unlike isStoreToStackSlot,
+ /// this returns true for any instruction that stores to the
+ /// stack. This is a hint only and may not catch all cases.
+ bool hasStoreToStackSlot(const MachineInstr *MI,
+ const MachineMemOperand *&MMO,
+ int &FrameIndex) const;
+
+ bool isReallyTriviallyReMaterializable(const MachineInstr *MI,
+ AliasAnalysis *AA) const;
+ void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ unsigned DestReg, unsigned SubIdx,
+ const MachineInstr *Orig,
+ const TargetRegisterInfo *TRI) const;
+
+ /// convertToThreeAddress - This method must be implemented by targets that
+ /// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target
+ /// may be able to convert a two-address instruction into a true
+ /// three-address instruction on demand. This allows the X86 target (for
+ /// example) to convert ADD and SHL instructions into LEA instructions if they
+ /// would require register copies due to two-addressness.
+ ///
+ /// This method returns a null pointer if the transformation cannot be
+ /// performed, otherwise it returns the new instruction.
+ ///
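+ /// As a purely illustrative example: a two-address "ADD32rr dst(tied), src"
+ /// whose first operand must stay live afterwards can instead be rewritten as
+ /// "LEA32r newdst, [dst + src]", avoiding the register copy that the
+ /// two-address form would otherwise force.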
+ virtual MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
+ MachineBasicBlock::iterator &MBBI,
+ LiveVariables *LV) const;
+
+ /// commuteInstruction - We have a few instructions that must be hacked on to
+ /// commute them.
+ ///
+ virtual MachineInstr *commuteInstruction(MachineInstr *MI, bool NewMI) const;
+
+ // Branch analysis.
+ virtual bool isUnpredicatedTerminator(const MachineInstr* MI) const;
+ virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const;
+ virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
+ virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ const SmallVectorImpl<MachineOperand> &Cond) const;
+ virtual bool copyRegToReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, unsigned SrcReg,
+ const TargetRegisterClass *DestRC,
+ const TargetRegisterClass *SrcRC,
+ DebugLoc DL) const;
+ virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned SrcReg, bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const;
+
+ virtual void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ MachineInstr::mmo_iterator MMOBegin,
+ MachineInstr::mmo_iterator MMOEnd,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+ virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const;
+
+ virtual void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
+ SmallVectorImpl<MachineOperand> &Addr,
+ const TargetRegisterClass *RC,
+ MachineInstr::mmo_iterator MMOBegin,
+ MachineInstr::mmo_iterator MMOEnd,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+ virtual bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const;
+
+ virtual bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const;
+
+ virtual
+ MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF,
+ int FrameIx, uint64_t Offset,
+ const MDNode *MDPtr,
+ DebugLoc DL) const;
+
+ /// foldMemoryOperand - If this target supports it, fold a load or store of
+ /// the specified stack slot into the specified machine instruction for the
+ /// specified operand(s). If this is possible, the target should perform the
+ /// folding and return true, otherwise it should return false. If it folds
+ /// the instruction, it is likely that the MachineInstruction the iterator
+ /// references has been changed.
+ virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr* MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ int FrameIndex) const;
+
+ /// foldMemoryOperand - Same as the previous version except it allows folding
+ /// of any load and store from / to any address, not just from a specific
+ /// stack slot.
+ virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr* MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ MachineInstr* LoadMI) const;
+
+ /// canFoldMemoryOperand - Returns true for the specified load / store if
+ /// folding is possible.
+ virtual bool canFoldMemoryOperand(const MachineInstr*,
+ const SmallVectorImpl<unsigned> &) const;
+
+ /// unfoldMemoryOperand - Separate a single instruction which folded a load or
+ /// a store or a load and a store into two or more instructions. If this is
+ /// possible, returns true as well as the new instructions by reference.
+ virtual bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
+ unsigned Reg, bool UnfoldLoad, bool UnfoldStore,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+ virtual bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
+ SmallVectorImpl<SDNode*> &NewNodes) const;
+
+ /// getOpcodeAfterMemoryUnfold - Returns the opcode of the would be new
+ /// instruction after load / store are unfolded from an instruction of the
+ /// specified opcode. It returns zero if the specified unfolding is not
+ /// possible. If LoadRegIndex is non-null, it is filled in with the operand
+ /// index of the operand which will hold the register holding the loaded
+ /// value.
+ virtual unsigned getOpcodeAfterMemoryUnfold(unsigned Opc,
+ bool UnfoldLoad, bool UnfoldStore,
+ unsigned *LoadRegIndex = 0) const;
+
+ /// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler
+ /// to determine if two loads are loading from the same base address. It
+ /// should only return true if the base pointers are the same and the
+ /// only difference between the two addresses is the offset. It also returns
+ /// the offsets by reference.
+ virtual bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
+ int64_t &Offset1, int64_t &Offset2) const;
+
+ /// shouldScheduleLoadsNear - This is used by the pre-regalloc scheduler to
+ /// determine (in conjunction with areLoadsFromSameBasePtr) if two loads should
+ /// be scheduled together. On some targets if two loads are loading from
+ /// addresses in the same cache line, it's better if they are scheduled
+ /// together. This function takes two integers that represent the load offsets
+ /// from the common base address. It returns true if it decides it's desirable
+ /// to schedule the two loads together. "NumLoads" is the number of loads that
+ /// have already been scheduled after Load1.
+ virtual bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
+ int64_t Offset1, int64_t Offset2,
+ unsigned NumLoads) const;
+
+ virtual void getNoopForMachoTarget(MCInst &NopInst) const;
+
+ virtual
+ bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
+
+ /// isSafeToMoveRegClassDefs - Return true if it's safe to move a machine
+ /// instruction that defines the specified register class.
+ bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const;
+
+ static bool isX86_64NonExtLowByteReg(unsigned reg) {
+ return (reg == X86::SPL || reg == X86::BPL ||
+ reg == X86::SIL || reg == X86::DIL);
+ }
+
+ static bool isX86_64ExtendedReg(const MachineOperand &MO) {
+ if (!MO.isReg()) return false;
+ return isX86_64ExtendedReg(MO.getReg());
+ }
+ static unsigned determineREX(const MachineInstr &MI);
+
+ /// isX86_64ExtendedReg - Is the MachineOperand an x86-64 extended (r8 or
+ /// higher) register? e.g. r8, xmm8, xmm13, etc.
+ static bool isX86_64ExtendedReg(unsigned RegNo);
+
+ /// GetInstSize - Returns the size of the specified MachineInstr.
+ ///
+ virtual unsigned GetInstSizeInBytes(const MachineInstr *MI) const;
+
+ /// getGlobalBaseReg - Return a virtual register initialized with the
+ /// global base register value. Output instructions required to
+ /// initialize the register in the function entry block, if necessary.
+ ///
+ unsigned getGlobalBaseReg(MachineFunction *MF) const;
+
+ /// GetSSEDomain - Return the SSE execution domain of MI as the first element,
+ /// and a bitmask of possible arguments to SetSSEDomain as the second.
+ std::pair<uint16_t, uint16_t> GetSSEDomain(const MachineInstr *MI) const;
+
+ /// SetSSEDomain - Set the SSEDomain of MI.
+ void SetSSEDomain(MachineInstr *MI, unsigned Domain) const;
+
+private:
+ MachineInstr * convertToThreeAddressWithLEA(unsigned MIOpc,
+ MachineFunction::iterator &MFI,
+ MachineBasicBlock::iterator &MBBI,
+ LiveVariables *LV) const;
+
+ MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr* MI,
+ unsigned OpNum,
+ const SmallVectorImpl<MachineOperand> &MOs,
+ unsigned Size, unsigned Alignment) const;
+
+ /// isFrameOperand - Return true and the FrameIndex if the specified
+ /// operand and following operands form a reference to the stack frame.
+ bool isFrameOperand(const MachineInstr *MI, unsigned int Op,
+ int &FrameIndex) const;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.td b/contrib/llvm/lib/Target/X86/X86InstrInfo.td
new file mode 100644
index 0000000..0d59c42
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.td
@@ -0,0 +1,4895 @@
+//===----------------------------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 instruction set, defining the instructions, and
+// properties of the instructions which are needed for code generation, machine
+// code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// X86 specific DAG Nodes.
+//
+
+def SDTIntShiftDOp: SDTypeProfile<1, 3,
+ [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
+ SDTCisInt<0>, SDTCisInt<3>]>;
+
+def SDTX86CmpTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisSameAs<1, 2>]>;
+
+def SDTX86Cmov : SDTypeProfile<1, 4,
+ [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
+ SDTCisVT<3, i8>, SDTCisVT<4, i32>]>;
+
+// Unary and binary operator instructions that set EFLAGS as a side-effect.
+def SDTUnaryArithWithFlags : SDTypeProfile<2, 1,
+ [SDTCisInt<0>, SDTCisVT<1, i32>]>;
+
+def SDTBinaryArithWithFlags : SDTypeProfile<2, 2,
+ [SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>,
+ SDTCisInt<0>, SDTCisVT<1, i32>]>;
+def SDTX86BrCond : SDTypeProfile<0, 3,
+ [SDTCisVT<0, OtherVT>,
+ SDTCisVT<1, i8>, SDTCisVT<2, i32>]>;
+
+def SDTX86SetCC : SDTypeProfile<1, 2,
+ [SDTCisVT<0, i8>,
+ SDTCisVT<1, i8>, SDTCisVT<2, i32>]>;
+def SDTX86SetCC_C : SDTypeProfile<1, 2,
+ [SDTCisInt<0>,
+ SDTCisVT<1, i8>, SDTCisVT<2, i32>]>;
+
+def SDTX86cas : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisInt<1>,
+ SDTCisVT<2, i8>]>;
+def SDTX86cas8 : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+
+def SDTX86atomicBinary : SDTypeProfile<2, 3, [SDTCisInt<0>, SDTCisInt<1>,
+ SDTCisPtrTy<2>, SDTCisInt<3>,SDTCisInt<4>]>;
+def SDTX86Ret : SDTypeProfile<0, -1, [SDTCisVT<0, i16>]>;
+
+def SDT_X86CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
+def SDT_X86CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>,
+ SDTCisVT<1, i32>]>;
+
+def SDT_X86Call : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;
+
+def SDT_X86VASTART_SAVE_XMM_REGS : SDTypeProfile<0, -1, [SDTCisVT<0, i8>,
+ SDTCisVT<1, iPTR>,
+ SDTCisVT<2, iPTR>]>;
+
+def SDTX86RepStr : SDTypeProfile<0, 1, [SDTCisVT<0, OtherVT>]>;
+
+def SDTX86Void : SDTypeProfile<0, 0, []>;
+
+def SDTX86Wrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
+
+def SDT_X86TLSADDR : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
+def SDT_X86SegmentBaseAddress : SDTypeProfile<1, 1, [SDTCisPtrTy<0>]>;
+
+def SDT_X86EHRET : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
+def SDT_X86TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>;
+
+def X86bsf : SDNode<"X86ISD::BSF", SDTUnaryArithWithFlags>;
+def X86bsr : SDNode<"X86ISD::BSR", SDTUnaryArithWithFlags>;
+def X86shld : SDNode<"X86ISD::SHLD", SDTIntShiftDOp>;
+def X86shrd : SDNode<"X86ISD::SHRD", SDTIntShiftDOp>;
+
+def X86cmp : SDNode<"X86ISD::CMP" , SDTX86CmpTest>;
+def X86bt : SDNode<"X86ISD::BT", SDTX86CmpTest>;
+
+def X86cmov : SDNode<"X86ISD::CMOV", SDTX86Cmov>;
+def X86brcond : SDNode<"X86ISD::BRCOND", SDTX86BrCond,
+ [SDNPHasChain]>;
+def X86setcc : SDNode<"X86ISD::SETCC", SDTX86SetCC>;
+def X86setcc_c : SDNode<"X86ISD::SETCC_CARRY", SDTX86SetCC_C>;
+
+def X86cas : SDNode<"X86ISD::LCMPXCHG_DAG", SDTX86cas,
+ [SDNPHasChain, SDNPInFlag, SDNPOutFlag, SDNPMayStore,
+ SDNPMayLoad]>;
+def X86cas8 : SDNode<"X86ISD::LCMPXCHG8_DAG", SDTX86cas8,
+ [SDNPHasChain, SDNPInFlag, SDNPOutFlag, SDNPMayStore,
+ SDNPMayLoad]>;
+def X86AtomAdd64 : SDNode<"X86ISD::ATOMADD64_DAG", SDTX86atomicBinary,
+ [SDNPHasChain, SDNPMayStore,
+ SDNPMayLoad, SDNPMemOperand]>;
+def X86AtomSub64 : SDNode<"X86ISD::ATOMSUB64_DAG", SDTX86atomicBinary,
+ [SDNPHasChain, SDNPMayStore,
+ SDNPMayLoad, SDNPMemOperand]>;
+def X86AtomOr64 : SDNode<"X86ISD::ATOMOR64_DAG", SDTX86atomicBinary,
+ [SDNPHasChain, SDNPMayStore,
+ SDNPMayLoad, SDNPMemOperand]>;
+def X86AtomXor64 : SDNode<"X86ISD::ATOMXOR64_DAG", SDTX86atomicBinary,
+ [SDNPHasChain, SDNPMayStore,
+ SDNPMayLoad, SDNPMemOperand]>;
+def X86AtomAnd64 : SDNode<"X86ISD::ATOMAND64_DAG", SDTX86atomicBinary,
+ [SDNPHasChain, SDNPMayStore,
+ SDNPMayLoad, SDNPMemOperand]>;
+def X86AtomNand64 : SDNode<"X86ISD::ATOMNAND64_DAG", SDTX86atomicBinary,
+ [SDNPHasChain, SDNPMayStore,
+ SDNPMayLoad, SDNPMemOperand]>;
+def X86AtomSwap64 : SDNode<"X86ISD::ATOMSWAP64_DAG", SDTX86atomicBinary,
+ [SDNPHasChain, SDNPMayStore,
+ SDNPMayLoad, SDNPMemOperand]>;
+def X86retflag : SDNode<"X86ISD::RET_FLAG", SDTX86Ret,
+ [SDNPHasChain, SDNPOptInFlag, SDNPVariadic]>;
+
+def X86vastart_save_xmm_regs :
+ SDNode<"X86ISD::VASTART_SAVE_XMM_REGS",
+ SDT_X86VASTART_SAVE_XMM_REGS,
+ [SDNPHasChain, SDNPVariadic]>;
+
+def X86callseq_start :
+ SDNode<"ISD::CALLSEQ_START", SDT_X86CallSeqStart,
+ [SDNPHasChain, SDNPOutFlag]>;
+def X86callseq_end :
+ SDNode<"ISD::CALLSEQ_END", SDT_X86CallSeqEnd,
+ [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+
+def X86call : SDNode<"X86ISD::CALL", SDT_X86Call,
+ [SDNPHasChain, SDNPOutFlag, SDNPOptInFlag,
+ SDNPVariadic]>;
+
+def X86rep_stos: SDNode<"X86ISD::REP_STOS", SDTX86RepStr,
+ [SDNPHasChain, SDNPInFlag, SDNPOutFlag, SDNPMayStore]>;
+def X86rep_movs: SDNode<"X86ISD::REP_MOVS", SDTX86RepStr,
+ [SDNPHasChain, SDNPInFlag, SDNPOutFlag, SDNPMayStore,
+ SDNPMayLoad]>;
+
+def X86rdtsc : SDNode<"X86ISD::RDTSC_DAG", SDTX86Void,
+ [SDNPHasChain, SDNPOutFlag, SDNPSideEffect]>;
+
+def X86Wrapper : SDNode<"X86ISD::Wrapper", SDTX86Wrapper>;
+def X86WrapperRIP : SDNode<"X86ISD::WrapperRIP", SDTX86Wrapper>;
+
+def X86tlsaddr : SDNode<"X86ISD::TLSADDR", SDT_X86TLSADDR,
+ [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+def X86SegmentBaseAddress : SDNode<"X86ISD::SegmentBaseAddress",
+ SDT_X86SegmentBaseAddress, []>;
+
+def X86ehret : SDNode<"X86ISD::EH_RETURN", SDT_X86EHRET,
+ [SDNPHasChain]>;
+
+def X86tcret : SDNode<"X86ISD::TC_RETURN", SDT_X86TCRET,
+ [SDNPHasChain, SDNPOptInFlag, SDNPVariadic]>;
+
+def X86add_flag : SDNode<"X86ISD::ADD", SDTBinaryArithWithFlags,
+ [SDNPCommutative]>;
+def X86sub_flag : SDNode<"X86ISD::SUB", SDTBinaryArithWithFlags>;
+def X86smul_flag : SDNode<"X86ISD::SMUL", SDTBinaryArithWithFlags,
+ [SDNPCommutative]>;
+def X86umul_flag : SDNode<"X86ISD::UMUL", SDTUnaryArithWithFlags,
+ [SDNPCommutative]>;
+
+def X86inc_flag : SDNode<"X86ISD::INC", SDTUnaryArithWithFlags>;
+def X86dec_flag : SDNode<"X86ISD::DEC", SDTUnaryArithWithFlags>;
+def X86or_flag : SDNode<"X86ISD::OR", SDTBinaryArithWithFlags,
+ [SDNPCommutative]>;
+def X86xor_flag : SDNode<"X86ISD::XOR", SDTBinaryArithWithFlags,
+ [SDNPCommutative]>;
+def X86and_flag : SDNode<"X86ISD::AND", SDTBinaryArithWithFlags,
+ [SDNPCommutative]>;
+
+def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>;
+
+def X86MingwAlloca : SDNode<"X86ISD::MINGW_ALLOCA", SDTX86Void,
+ [SDNPHasChain, SDNPInFlag, SDNPOutFlag]>;
+
+//===----------------------------------------------------------------------===//
+// X86 Operand Definitions.
+//
+
+// A version of ptr_rc which excludes SP, ESP, and RSP. This is used for
+// the index operand of an address, to conform to x86 encoding restrictions.
+def ptr_rc_nosp : PointerLikeRegClass<1>;
+
+// *mem - Operand definitions for the funky X86 addressing mode operands.
+//
+def X86MemAsmOperand : AsmOperandClass {
+ let Name = "Mem";
+ let SuperClasses = [];
+}
+def X86NoSegMemAsmOperand : AsmOperandClass {
+ let Name = "NoSegMem";
+ let SuperClasses = [X86MemAsmOperand];
+}
+def X86AbsMemAsmOperand : AsmOperandClass {
+ let Name = "AbsMem";
+ let SuperClasses = [X86NoSegMemAsmOperand];
+}
+class X86MemOperand<string printMethod> : Operand<iPTR> {
+ let PrintMethod = printMethod;
+ let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm);
+ let ParserMatchClass = X86MemAsmOperand;
+}
+
+def opaque32mem : X86MemOperand<"printopaquemem">;
+def opaque48mem : X86MemOperand<"printopaquemem">;
+def opaque80mem : X86MemOperand<"printopaquemem">;
+def opaque512mem : X86MemOperand<"printopaquemem">;
+
+def i8mem : X86MemOperand<"printi8mem">;
+def i16mem : X86MemOperand<"printi16mem">;
+def i32mem : X86MemOperand<"printi32mem">;
+def i64mem : X86MemOperand<"printi64mem">;
+def i128mem : X86MemOperand<"printi128mem">;
+//def i256mem : X86MemOperand<"printi256mem">;
+def f32mem : X86MemOperand<"printf32mem">;
+def f64mem : X86MemOperand<"printf64mem">;
+def f80mem : X86MemOperand<"printf80mem">;
+def f128mem : X86MemOperand<"printf128mem">;
+//def f256mem : X86MemOperand<"printf256mem">;
+
+// A version of i8mem for use on x86-64 that uses GR64_NOREX instead of
+// plain GR64, so that it doesn't potentially require a REX prefix.
+def i8mem_NOREX : Operand<i64> {
+ let PrintMethod = "printi8mem";
+ let MIOperandInfo = (ops GR64_NOREX, i8imm, GR64_NOREX_NOSP, i32imm, i8imm);
+ let ParserMatchClass = X86MemAsmOperand;
+}
+
+// Special i32mem for addresses of load folding tail calls. These are not
+// allowed to use callee-saved registers since they must be scheduled
+// after callee-saved registers are popped.
+def i32mem_TC : Operand<i32> {
+ let PrintMethod = "printi32mem";
+ let MIOperandInfo = (ops GR32_TC, i8imm, GR32_TC, i32imm, i8imm);
+ let ParserMatchClass = X86MemAsmOperand;
+}
+
+def lea32mem : Operand<i32> {
+ let PrintMethod = "printlea32mem";
+ let MIOperandInfo = (ops GR32, i8imm, GR32_NOSP, i32imm);
+ let ParserMatchClass = X86NoSegMemAsmOperand;
+}
+
+let ParserMatchClass = X86AbsMemAsmOperand,
+ PrintMethod = "print_pcrel_imm" in {
+def i32imm_pcrel : Operand<i32>;
+
+def offset8 : Operand<i64>;
+def offset16 : Operand<i64>;
+def offset32 : Operand<i64>;
+def offset64 : Operand<i64>;
+
+// Branch targets have OtherVT type and print as pc-relative values.
+def brtarget : Operand<OtherVT>;
+def brtarget8 : Operand<OtherVT>;
+
+}
+
+def SSECC : Operand<i8> {
+ let PrintMethod = "printSSECC";
+}
+
+class ImmSExtAsmOperandClass : AsmOperandClass {
+ let SuperClasses = [ImmAsmOperand];
+ let RenderMethod = "addImmOperands";
+}
+
+// Sign-extended immediate classes. We don't need to define the full lattice
+// here because there is no instruction with an ambiguity between ImmSExti64i32
+// and ImmSExti32i8.
+//
+// The strange ranges come from the fact that the assembler always works with
+// 64-bit immediates, but for a 16-bit target value we want to accept both "-1"
+// (which will be a -1ULL), and "0xFFFF" (-1 in 16 bits).
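+//
+// For instance, a 16-bit operation with an 8-bit sign-extended immediate
+// should accept "$-16", "$0xFFF0" and "$0xFFFFFFFFFFFFFFF0" as the same
+// value, which is why each class below lists several disjoint 64-bit ranges
+// (the ranges quoted with each class are the authoritative ones).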
+
+// [0, 0x7FFFFFFF] | [0xFFFFFFFF80000000, 0xFFFFFFFFFFFFFFFF]
+def ImmSExti64i32AsmOperand : ImmSExtAsmOperandClass {
+ let Name = "ImmSExti64i32";
+}
+
+// [0, 0x0000007F] | [0x000000000000FF80, 0x000000000000FFFF] | [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF]
+def ImmSExti16i8AsmOperand : ImmSExtAsmOperandClass {
+ let Name = "ImmSExti16i8";
+ let SuperClasses = [ImmSExti64i32AsmOperand];
+}
+
+// [0, 0x0000007F] | [0x00000000FFFFFF80, 0x00000000FFFFFFFF] | [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF]
+def ImmSExti32i8AsmOperand : ImmSExtAsmOperandClass {
+ let Name = "ImmSExti32i8";
+}
+
+// [0, 0x0000007F] | [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF]
+def ImmSExti64i8AsmOperand : ImmSExtAsmOperandClass {
+ let Name = "ImmSExti64i8";
+ let SuperClasses = [ImmSExti16i8AsmOperand, ImmSExti32i8AsmOperand, ImmSExti64i32AsmOperand];
+}
+
+// A couple more descriptive operand definitions.
+// 16-bits but only 8 bits are significant.
+def i16i8imm : Operand<i16> {
+ let ParserMatchClass = ImmSExti16i8AsmOperand;
+}
+// 32-bits but only 8 bits are significant.
+def i32i8imm : Operand<i32> {
+ let ParserMatchClass = ImmSExti32i8AsmOperand;
+}
+
+//===----------------------------------------------------------------------===//
+// X86 Complex Pattern Definitions.
+//
+
+// Define X86 specific addressing mode.
+def addr : ComplexPattern<iPTR, 5, "SelectAddr", [], []>;
+def lea32addr : ComplexPattern<i32, 4, "SelectLEAAddr",
+ [add, sub, mul, X86mul_imm, shl, or, frameindex],
+ []>;
+def tls32addr : ComplexPattern<i32, 4, "SelectTLSADDRAddr",
+ [tglobaltlsaddr], []>;
+
+//===----------------------------------------------------------------------===//
+// X86 Instruction Predicate Definitions.
+def HasCMov : Predicate<"Subtarget->hasCMov()">;
+def NoCMov : Predicate<"!Subtarget->hasCMov()">;
+def HasMMX : Predicate<"Subtarget->hasMMX()">;
+def HasSSE1 : Predicate<"Subtarget->hasSSE1()">;
+def HasSSE2 : Predicate<"Subtarget->hasSSE2()">;
+def HasSSE3 : Predicate<"Subtarget->hasSSE3()">;
+def HasSSSE3 : Predicate<"Subtarget->hasSSSE3()">;
+def HasSSE41 : Predicate<"Subtarget->hasSSE41()">;
+def HasSSE42 : Predicate<"Subtarget->hasSSE42()">;
+def HasSSE4A : Predicate<"Subtarget->hasSSE4A()">;
+def HasAVX : Predicate<"Subtarget->hasAVX()">;
+def HasFMA3 : Predicate<"Subtarget->hasFMA3()">;
+def HasFMA4 : Predicate<"Subtarget->hasFMA4()">;
+def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">;
+def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">;
+def In32BitMode : Predicate<"!Subtarget->is64Bit()">;
+def In64BitMode : Predicate<"Subtarget->is64Bit()">;
+def IsWin64 : Predicate<"Subtarget->isTargetWin64()">;
+def NotWin64 : Predicate<"!Subtarget->isTargetWin64()">;
+def SmallCode : Predicate<"TM.getCodeModel() == CodeModel::Small">;
+def KernelCode : Predicate<"TM.getCodeModel() == CodeModel::Kernel">;
+def FarData : Predicate<"TM.getCodeModel() != CodeModel::Small &&"
+ "TM.getCodeModel() != CodeModel::Kernel">;
+def NearData : Predicate<"TM.getCodeModel() == CodeModel::Small ||"
+ "TM.getCodeModel() == CodeModel::Kernel">;
+def IsStatic : Predicate<"TM.getRelocationModel() == Reloc::Static">;
+def IsNotPIC : Predicate<"TM.getRelocationModel() != Reloc::PIC_">;
+def OptForSize : Predicate<"OptForSize">;
+def OptForSpeed : Predicate<"!OptForSize">;
+def FastBTMem : Predicate<"!Subtarget->isBTMemSlow()">;
+def CallImmAddr : Predicate<"Subtarget->IsLegalToCallImmediateAddr(TM)">;
+def HasAES : Predicate<"Subtarget->hasAES()">;
+
+//===----------------------------------------------------------------------===//
+// X86 Instruction Format Definitions.
+//
+
+include "X86InstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// Pattern fragments...
+//
+
+// X86 specific condition code. These correspond to CondCode in
+// X86InstrInfo.h. They must be kept in synch.
+def X86_COND_A : PatLeaf<(i8 0)>; // alt. COND_NBE
+def X86_COND_AE : PatLeaf<(i8 1)>; // alt. COND_NC
+def X86_COND_B : PatLeaf<(i8 2)>; // alt. COND_C
+def X86_COND_BE : PatLeaf<(i8 3)>; // alt. COND_NA
+def X86_COND_E : PatLeaf<(i8 4)>; // alt. COND_Z
+def X86_COND_G : PatLeaf<(i8 5)>; // alt. COND_NLE
+def X86_COND_GE : PatLeaf<(i8 6)>; // alt. COND_NL
+def X86_COND_L : PatLeaf<(i8 7)>; // alt. COND_NGE
+def X86_COND_LE : PatLeaf<(i8 8)>; // alt. COND_NG
+def X86_COND_NE : PatLeaf<(i8 9)>; // alt. COND_NZ
+def X86_COND_NO : PatLeaf<(i8 10)>;
+def X86_COND_NP : PatLeaf<(i8 11)>; // alt. COND_PO
+def X86_COND_NS : PatLeaf<(i8 12)>;
+def X86_COND_O : PatLeaf<(i8 13)>;
+def X86_COND_P : PatLeaf<(i8 14)>; // alt. COND_PE
+def X86_COND_S : PatLeaf<(i8 15)>;
+
+def immSext8 : PatLeaf<(imm), [{
+ return N->getSExtValue() == (int8_t)N->getSExtValue();
+}]>;
+
+def i16immSExt8 : PatLeaf<(i16 immSext8)>;
+def i32immSExt8 : PatLeaf<(i32 immSext8)>;
+
+/// Load patterns: these constrain the match to the right address space.
+def dsload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ if (const Value *Src = cast<LoadSDNode>(N)->getSrcValue())
+ if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
+ if (PT->getAddressSpace() > 255)
+ return false;
+ return true;
+}]>;
+
+def gsload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ if (const Value *Src = cast<LoadSDNode>(N)->getSrcValue())
+ if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
+ return PT->getAddressSpace() == 256;
+ return false;
+}]>;
+
+def fsload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ if (const Value *Src = cast<LoadSDNode>(N)->getSrcValue())
+ if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
+ return PT->getAddressSpace() == 257;
+ return false;
+}]>;
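+
+// Note (an assumed convention, consistent with the fragments above): the X86
+// backend models %gs-relative pointers as address space 256 and %fs-relative
+// pointers as address space 257, which is what gsload and fsload test for,
+// while dsload accepts anything outside the reserved >255 range.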
+
+
+// Helper fragments for loads.
+// It's always safe to treat an anyext i16 load as an i32 load if the i16 is
+// known to be 32-bit aligned or better. Ditto for i8 to i16.
+def loadi16 : PatFrag<(ops node:$ptr), (i16 (unindexedload node:$ptr)), [{
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ if (const Value *Src = LD->getSrcValue())
+ if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
+ if (PT->getAddressSpace() > 255)
+ return false;
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ if (ExtType == ISD::NON_EXTLOAD)
+ return true;
+ if (ExtType == ISD::EXTLOAD)
+ return LD->getAlignment() >= 2 && !LD->isVolatile();
+ return false;
+}]>;
+
+def loadi16_anyext : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)),[{
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ if (const Value *Src = LD->getSrcValue())
+ if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
+ if (PT->getAddressSpace() > 255)
+ return false;
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ if (ExtType == ISD::EXTLOAD)
+ return LD->getAlignment() >= 2 && !LD->isVolatile();
+ return false;
+}]>;
+
+def loadi32 : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)), [{
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ if (const Value *Src = LD->getSrcValue())
+ if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
+ if (PT->getAddressSpace() > 255)
+ return false;
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ if (ExtType == ISD::NON_EXTLOAD)
+ return true;
+ if (ExtType == ISD::EXTLOAD)
+ return LD->getAlignment() >= 4 && !LD->isVolatile();
+ return false;
+}]>;
+
+def loadi8 : PatFrag<(ops node:$ptr), (i8 (dsload node:$ptr))>;
+def loadi64 : PatFrag<(ops node:$ptr), (i64 (dsload node:$ptr))>;
+def loadf32 : PatFrag<(ops node:$ptr), (f32 (dsload node:$ptr))>;
+def loadf64 : PatFrag<(ops node:$ptr), (f64 (dsload node:$ptr))>;
+def loadf80 : PatFrag<(ops node:$ptr), (f80 (dsload node:$ptr))>;
+
+def sextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (sextloadi8 node:$ptr))>;
+def sextloadi32i8 : PatFrag<(ops node:$ptr), (i32 (sextloadi8 node:$ptr))>;
+def sextloadi32i16 : PatFrag<(ops node:$ptr), (i32 (sextloadi16 node:$ptr))>;
+
+def zextloadi8i1 : PatFrag<(ops node:$ptr), (i8 (zextloadi1 node:$ptr))>;
+def zextloadi16i1 : PatFrag<(ops node:$ptr), (i16 (zextloadi1 node:$ptr))>;
+def zextloadi32i1 : PatFrag<(ops node:$ptr), (i32 (zextloadi1 node:$ptr))>;
+def zextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (zextloadi8 node:$ptr))>;
+def zextloadi32i8 : PatFrag<(ops node:$ptr), (i32 (zextloadi8 node:$ptr))>;
+def zextloadi32i16 : PatFrag<(ops node:$ptr), (i32 (zextloadi16 node:$ptr))>;
+
+def extloadi8i1 : PatFrag<(ops node:$ptr), (i8 (extloadi1 node:$ptr))>;
+def extloadi16i1 : PatFrag<(ops node:$ptr), (i16 (extloadi1 node:$ptr))>;
+def extloadi32i1 : PatFrag<(ops node:$ptr), (i32 (extloadi1 node:$ptr))>;
+def extloadi16i8 : PatFrag<(ops node:$ptr), (i16 (extloadi8 node:$ptr))>;
+def extloadi32i8 : PatFrag<(ops node:$ptr), (i32 (extloadi8 node:$ptr))>;
+def extloadi32i16 : PatFrag<(ops node:$ptr), (i32 (extloadi16 node:$ptr))>;
+
+
+// An 'and' node with a single use.
+def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{
+ return N->hasOneUse();
+}]>;
+// An 'srl' node with a single use.
+def srl_su : PatFrag<(ops node:$lhs, node:$rhs), (srl node:$lhs, node:$rhs), [{
+ return N->hasOneUse();
+}]>;
+// A 'trunc' node with a single use.
+def trunc_su : PatFrag<(ops node:$src), (trunc node:$src), [{
+ return N->hasOneUse();
+}]>;
+
+// Treat an 'or' node as an 'add' if the or'ed bits are known to be zero.
+def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1)))
+ return CurDAG->MaskedValueIsZero(N->getOperand(0), CN->getAPIntValue());
+
+ unsigned BitWidth = N->getValueType(0).getScalarType().getSizeInBits();
+ APInt Mask = APInt::getAllOnesValue(BitWidth);
+ APInt KnownZero0, KnownOne0;
+ CurDAG->ComputeMaskedBits(N->getOperand(0), Mask, KnownZero0, KnownOne0, 0);
+ APInt KnownZero1, KnownOne1;
+ CurDAG->ComputeMaskedBits(N->getOperand(1), Mask, KnownZero1, KnownOne1, 0);
+ return (~KnownZero0 & ~KnownZero1) == 0;
+}]>;
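+
+// For instance (an assumed example, in the spirit of the predicate above): if
+// the low four bits of %x are known to be zero, then (or %x, 3) computes
+// exactly the same value as (add %x, 3), so matching it via or_is_add lets
+// the selector use the add-oriented patterns (e.g. folding into an LEA).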
+
+//===----------------------------------------------------------------------===//
+// Instruction list...
+//
+
+// ADJCALLSTACKDOWN/UP implicitly use/def ESP because they may be expanded into
+// a stack adjustment and the codegen must know that they may modify the stack
+// pointer before prolog-epilog rewriting occurs.
+// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
+// sub / add which can clobber EFLAGS.
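+// As an illustrative lowering (not a definition): "ADJCALLSTACKDOWN32 12"
+// would typically become "subl $12, %esp", and the matching ADJCALLSTACKUP32
+// an "addl", which is why EFLAGS appears in Defs below.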
+let Defs = [ESP, EFLAGS], Uses = [ESP] in {
+def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt),
+ "#ADJCALLSTACKDOWN",
+ [(X86callseq_start timm:$amt)]>,
+ Requires<[In32BitMode]>;
+def ADJCALLSTACKUP32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "#ADJCALLSTACKUP",
+ [(X86callseq_end timm:$amt1, timm:$amt2)]>,
+ Requires<[In32BitMode]>;
+}
+
+// x86-64 va_start lowering magic.
+let usesCustomInserter = 1 in {
+def VASTART_SAVE_XMM_REGS : I<0, Pseudo,
+ (outs),
+ (ins GR8:$al,
+ i64imm:$regsavefi, i64imm:$offset,
+ variable_ops),
+ "#VASTART_SAVE_XMM_REGS $al, $regsavefi, $offset",
+ [(X86vastart_save_xmm_regs GR8:$al,
+ imm:$regsavefi,
+ imm:$offset)]>;
+
+// Dynamic stack allocation yields an _alloca call for Cygwin/Mingw targets.
+// The call to _alloca is needed to probe the stack when allocating more than
+// 4k bytes in one go. Touching the stack at 4K increments is necessary to
+// ensure that the guard pages used by the OS virtual memory manager are
+// allocated in the correct sequence.
+// The main point of having a separate instruction is the extra unmodelled
+// effects (compared to ordinary calls) like the stack pointer change.
+
+def MINGW_ALLOCA : I<0, Pseudo, (outs), (ins),
+ "# dynamic stack allocation",
+ [(X86MingwAlloca)]>;
+}
+
+// Nop
+let neverHasSideEffects = 1 in {
+ def NOOP : I<0x90, RawFrm, (outs), (ins), "nop", []>;
+ def NOOPW : I<0x1f, MRM0m, (outs), (ins i16mem:$zero),
+ "nop{w}\t$zero", []>, TB, OpSize;
+ def NOOPL : I<0x1f, MRM0m, (outs), (ins i32mem:$zero),
+ "nop{l}\t$zero", []>, TB;
+}
+
+// Trap
+def INTO : I<0xce, RawFrm, (outs), (ins), "into", []>;
+def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3", []>;
+// FIXME: need to make sure that "int $3" matches int3
+def INT : Ii8<0xcd, RawFrm, (outs), (ins i8imm:$trap), "int\t$trap", []>;
+def IRET16 : I<0xcf, RawFrm, (outs), (ins), "iret{w}", []>, OpSize;
+def IRET32 : I<0xcf, RawFrm, (outs), (ins), "iret{l}", []>;
+
+// PIC base construction. This expands to code that looks like this:
+// call $next_inst
+// popl %destreg
+let neverHasSideEffects = 1, isNotDuplicable = 1, Uses = [ESP] in
+ def MOVPC32r : Ii32<0xE8, Pseudo, (outs GR32:$reg), (ins i32imm:$label),
+ "", []>;
+
+//===----------------------------------------------------------------------===//
+// Control Flow Instructions.
+//
+
+// Return instructions.
+let isTerminator = 1, isReturn = 1, isBarrier = 1,
+ hasCtrlDep = 1, FPForm = SpecialFP in {
+ def RET : I <0xC3, RawFrm, (outs), (ins variable_ops),
+ "ret",
+ [(X86retflag 0)]>;
+ def RETI : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
+ "ret\t$amt",
+ [(X86retflag timm:$amt)]>;
+ def LRET : I <0xCB, RawFrm, (outs), (ins),
+ "lret", []>;
+ def LRETI : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
+ "lret\t$amt", []>;
+}
+
+// Unconditional branches.
+let isBarrier = 1, isBranch = 1, isTerminator = 1 in {
+ def JMP_4 : Ii32PCRel<0xE9, RawFrm, (outs), (ins brtarget:$dst),
+ "jmp\t$dst", [(br bb:$dst)]>;
+ def JMP_1 : Ii8PCRel<0xEB, RawFrm, (outs), (ins brtarget8:$dst),
+ "jmp\t$dst", []>;
+}
+
+// Conditional Branches.
+let isBranch = 1, isTerminator = 1, Uses = [EFLAGS] in {
+ multiclass ICBr<bits<8> opc1, bits<8> opc4, string asm, PatFrag Cond> {
+ def _1 : Ii8PCRel <opc1, RawFrm, (outs), (ins brtarget8:$dst), asm, []>;
+ def _4 : Ii32PCRel<opc4, RawFrm, (outs), (ins brtarget:$dst), asm,
+ [(X86brcond bb:$dst, Cond, EFLAGS)]>, TB;
+ }
+}
+
+defm JO : ICBr<0x70, 0x80, "jo\t$dst" , X86_COND_O>;
+defm JNO : ICBr<0x71, 0x81, "jno\t$dst" , X86_COND_NO>;
+defm JB : ICBr<0x72, 0x82, "jb\t$dst" , X86_COND_B>;
+defm JAE : ICBr<0x73, 0x83, "jae\t$dst", X86_COND_AE>;
+defm JE : ICBr<0x74, 0x84, "je\t$dst" , X86_COND_E>;
+defm JNE : ICBr<0x75, 0x85, "jne\t$dst", X86_COND_NE>;
+defm JBE : ICBr<0x76, 0x86, "jbe\t$dst", X86_COND_BE>;
+defm JA : ICBr<0x77, 0x87, "ja\t$dst" , X86_COND_A>;
+defm JS : ICBr<0x78, 0x88, "js\t$dst" , X86_COND_S>;
+defm JNS : ICBr<0x79, 0x89, "jns\t$dst", X86_COND_NS>;
+defm JP : ICBr<0x7A, 0x8A, "jp\t$dst" , X86_COND_P>;
+defm JNP : ICBr<0x7B, 0x8B, "jnp\t$dst", X86_COND_NP>;
+defm JL : ICBr<0x7C, 0x8C, "jl\t$dst" , X86_COND_L>;
+defm JGE : ICBr<0x7D, 0x8D, "jge\t$dst", X86_COND_GE>;
+defm JLE : ICBr<0x7E, 0x8E, "jle\t$dst", X86_COND_LE>;
+defm JG : ICBr<0x7F, 0x8F, "jg\t$dst" , X86_COND_G>;
+
+// FIXME: What about the CX/RCX versions of this instruction?
+let Uses = [ECX], isBranch = 1, isTerminator = 1 in
+ def JCXZ8 : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
+ "jcxz\t$dst", []>;
+
+
+// Indirect branches
+let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
+ def JMP32r : I<0xFF, MRM4r, (outs), (ins GR32:$dst), "jmp{l}\t{*}$dst",
+ [(brind GR32:$dst)]>;
+ def JMP32m : I<0xFF, MRM4m, (outs), (ins i32mem:$dst), "jmp{l}\t{*}$dst",
+ [(brind (loadi32 addr:$dst))]>;
+
+ def FARJMP16i : Iseg16<0xEA, RawFrm, (outs),
+ (ins i16imm:$seg, i16imm:$off),
+ "ljmp{w}\t$seg, $off", []>, OpSize;
+ def FARJMP32i : Iseg32<0xEA, RawFrm, (outs),
+ (ins i16imm:$seg, i32imm:$off),
+ "ljmp{l}\t$seg, $off", []>;
+
+ def FARJMP16m : I<0xFF, MRM5m, (outs), (ins opaque32mem:$dst),
+ "ljmp{w}\t{*}$dst", []>, OpSize;
+ def FARJMP32m : I<0xFF, MRM5m, (outs), (ins opaque48mem:$dst),
+ "ljmp{l}\t{*}$dst", []>;
+}
+
+
+// Loop instructions
+
+def LOOP : I<0xE2, RawFrm, (outs), (ins brtarget8:$dst), "loop\t$dst", []>;
+def LOOPE : I<0xE1, RawFrm, (outs), (ins brtarget8:$dst), "loope\t$dst", []>;
+def LOOPNE : I<0xE0, RawFrm, (outs), (ins brtarget8:$dst), "loopne\t$dst", []>;
+
+//===----------------------------------------------------------------------===//
+// Call Instructions...
+//
+let isCall = 1 in
+ // All calls clobber the non-callee saved registers. ESP is marked as
+ // a use to prevent stack-pointer assignments that appear immediately
+ // before calls from potentially appearing dead. Uses for argument
+ // registers are added manually.
+ let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0,
+ MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
+ XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+ XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
+ Uses = [ESP] in {
+ def CALLpcrel32 : Ii32PCRel<0xE8, RawFrm,
+ (outs), (ins i32imm_pcrel:$dst,variable_ops),
+ "call\t$dst", []>;
+ def CALL32r : I<0xFF, MRM2r, (outs), (ins GR32:$dst, variable_ops),
+ "call\t{*}$dst", [(X86call GR32:$dst)]>;
+ def CALL32m : I<0xFF, MRM2m, (outs), (ins i32mem:$dst, variable_ops),
+ "call\t{*}$dst", [(X86call (loadi32 addr:$dst))]>;
+
+ def FARCALL16i : Iseg16<0x9A, RawFrm, (outs),
+ (ins i16imm:$seg, i16imm:$off),
+ "lcall{w}\t$seg, $off", []>, OpSize;
+ def FARCALL32i : Iseg32<0x9A, RawFrm, (outs),
+ (ins i16imm:$seg, i32imm:$off),
+ "lcall{l}\t$seg, $off", []>;
+
+ def FARCALL16m : I<0xFF, MRM3m, (outs), (ins opaque32mem:$dst),
+ "lcall{w}\t{*}$dst", []>, OpSize;
+ def FARCALL32m : I<0xFF, MRM3m, (outs), (ins opaque48mem:$dst),
+ "lcall{l}\t{*}$dst", []>;
+ }
+
+// Constructing a stack frame.
+
+def ENTER : I<0xC8, RawFrm, (outs), (ins i16imm:$len, i8imm:$lvl),
+ "enter\t$len, $lvl", []>;
+
+// Tail call stuff.
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in
+ let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0,
+ MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
+ XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+ XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
+ Uses = [ESP] in {
+ def TCRETURNdi : I<0, Pseudo, (outs),
+ (ins i32imm_pcrel:$dst, i32imm:$offset, variable_ops),
+ "#TC_RETURN $dst $offset", []>;
+ def TCRETURNri : I<0, Pseudo, (outs),
+ (ins GR32_TC:$dst, i32imm:$offset, variable_ops),
+ "#TC_RETURN $dst $offset", []>;
+ let mayLoad = 1 in
+ def TCRETURNmi : I<0, Pseudo, (outs),
+ (ins i32mem_TC:$dst, i32imm:$offset, variable_ops),
+ "#TC_RETURN $dst $offset", []>;
+
+ // FIXME: These should be pseudo instructions that are lowered when going to
+ // MCInst.
+ def TAILJMPd : Ii32PCRel<0xE9, RawFrm, (outs),
+ (ins i32imm_pcrel:$dst, variable_ops),
+ "jmp\t$dst # TAILCALL",
+ []>;
+ def TAILJMPr : I<0xFF, MRM4r, (outs), (ins GR32_TC:$dst, variable_ops),
+ "jmp{l}\t{*}$dst # TAILCALL",
+ []>;
+ let mayLoad = 1 in
+ def TAILJMPm : I<0xFF, MRM4m, (outs), (ins i32mem_TC:$dst, variable_ops),
+ "jmp{l}\t{*}$dst # TAILCALL", []>;
+
+ // FIXME: This is a hack so that MCInst lowering can preserve the TAILCALL
+ // marker on instructions, while still being able to relax.
+ let isCodeGenOnly = 1 in {
+ def TAILJMP_1 : Ii8PCRel<0xEB, RawFrm, (outs), (ins brtarget8:$dst),
+ "jmp\t$dst # TAILCALL", []>;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous Instructions...
+//
+let Defs = [EBP, ESP], Uses = [EBP, ESP], mayLoad = 1, neverHasSideEffects=1 in
+def LEAVE : I<0xC9, RawFrm,
+ (outs), (ins), "leave", []>;
+
+def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "popcnt{w}\t{$src, $dst|$dst, $src}", []>, OpSize, XS;
+let mayLoad = 1 in
+def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "popcnt{w}\t{$src, $dst|$dst, $src}", []>, OpSize, XS;
+def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "popcnt{l}\t{$src, $dst|$dst, $src}", []>, XS;
+let mayLoad = 1 in
+def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "popcnt{l}\t{$src, $dst|$dst, $src}", []>, XS;
+
+let Defs = [ESP], Uses = [ESP], neverHasSideEffects=1 in {
+let mayLoad = 1 in {
+def POP16r : I<0x58, AddRegFrm, (outs GR16:$reg), (ins), "pop{w}\t$reg", []>,
+ OpSize;
+def POP32r : I<0x58, AddRegFrm, (outs GR32:$reg), (ins), "pop{l}\t$reg", []>;
+def POP16rmr: I<0x8F, MRM0r, (outs GR16:$reg), (ins), "pop{w}\t$reg", []>,
+ OpSize;
+def POP16rmm: I<0x8F, MRM0m, (outs i16mem:$dst), (ins), "pop{w}\t$dst", []>,
+ OpSize;
+def POP32rmr: I<0x8F, MRM0r, (outs GR32:$reg), (ins), "pop{l}\t$reg", []>;
+def POP32rmm: I<0x8F, MRM0m, (outs i32mem:$dst), (ins), "pop{l}\t$dst", []>;
+}
+
+let mayStore = 1 in {
+def PUSH16r : I<0x50, AddRegFrm, (outs), (ins GR16:$reg), "push{w}\t$reg",[]>,
+ OpSize;
+def PUSH32r : I<0x50, AddRegFrm, (outs), (ins GR32:$reg), "push{l}\t$reg",[]>;
+def PUSH16rmr: I<0xFF, MRM6r, (outs), (ins GR16:$reg), "push{w}\t$reg",[]>,
+ OpSize;
+def PUSH16rmm: I<0xFF, MRM6m, (outs), (ins i16mem:$src), "push{w}\t$src",[]>,
+ OpSize;
+def PUSH32rmr: I<0xFF, MRM6r, (outs), (ins GR32:$reg), "push{l}\t$reg",[]>;
+def PUSH32rmm: I<0xFF, MRM6m, (outs), (ins i32mem:$src), "push{l}\t$src",[]>;
+}
+}
+
+let Defs = [ESP], Uses = [ESP], neverHasSideEffects = 1, mayStore = 1 in {
+def PUSHi8 : Ii8<0x6a, RawFrm, (outs), (ins i32i8imm:$imm),
+ "push{l}\t$imm", []>;
+def PUSHi16 : Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm),
+ "push{w}\t$imm", []>, OpSize;
+def PUSHi32 : Ii32<0x68, RawFrm, (outs), (ins i32imm:$imm),
+ "push{l}\t$imm", []>;
+}
+
+let Defs = [ESP, EFLAGS], Uses = [ESP], mayLoad = 1, neverHasSideEffects=1 in {
+def POPF16 : I<0x9D, RawFrm, (outs), (ins), "popf{w}", []>, OpSize;
+def POPF32 : I<0x9D, RawFrm, (outs), (ins), "popf{l|d}", []>,
+ Requires<[In32BitMode]>;
+}
+let Defs = [ESP], Uses = [ESP, EFLAGS], mayStore = 1, neverHasSideEffects=1 in {
+def PUSHF16 : I<0x9C, RawFrm, (outs), (ins), "pushf{w}", []>, OpSize;
+def PUSHF32 : I<0x9C, RawFrm, (outs), (ins), "pushf{l|d}", []>,
+ Requires<[In32BitMode]>;
+}
+
+let isTwoAddress = 1 in // GR32 = bswap GR32
+ def BSWAP32r : I<0xC8, AddRegFrm,
+ (outs GR32:$dst), (ins GR32:$src),
+ "bswap{l}\t$dst",
+ [(set GR32:$dst, (bswap GR32:$src))]>, TB;
+
+
+// Bit scan instructions.
+let Defs = [EFLAGS] in {
+def BSF16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "bsf{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, EFLAGS, (X86bsf GR16:$src))]>, TB, OpSize;
+def BSF16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "bsf{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, EFLAGS, (X86bsf (loadi16 addr:$src)))]>, TB,
+ OpSize;
+def BSF32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "bsf{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))]>, TB;
+def BSF32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "bsf{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, EFLAGS, (X86bsf (loadi32 addr:$src)))]>, TB;
+
+def BSR16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "bsr{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, EFLAGS, (X86bsr GR16:$src))]>, TB, OpSize;
+def BSR16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "bsr{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, EFLAGS, (X86bsr (loadi16 addr:$src)))]>, TB,
+ OpSize;
+def BSR32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "bsr{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))]>, TB;
+def BSR32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "bsr{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, EFLAGS, (X86bsr (loadi32 addr:$src)))]>, TB;
+} // Defs = [EFLAGS]
+
+let neverHasSideEffects = 1 in
+def LEA16r : I<0x8D, MRMSrcMem,
+ (outs GR16:$dst), (ins lea32mem:$src),
+ "lea{w}\t{$src|$dst}, {$dst|$src}", []>, OpSize;
+let isReMaterializable = 1 in
+def LEA32r : I<0x8D, MRMSrcMem,
+ (outs GR32:$dst), (ins lea32mem:$src),
+ "lea{l}\t{$src|$dst}, {$dst|$src}",
+ [(set GR32:$dst, lea32addr:$src)]>, Requires<[In32BitMode]>;
+
+let Defs = [ECX,EDI,ESI], Uses = [ECX,EDI,ESI], isCodeGenOnly = 1 in {
+def REP_MOVSB : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}",
+ [(X86rep_movs i8)]>, REP;
+def REP_MOVSW : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw|rep movsw}",
+ [(X86rep_movs i16)]>, REP, OpSize;
+def REP_MOVSD : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl|rep movsd}",
+ [(X86rep_movs i32)]>, REP;
+}
+
+// These use the DF flag in the EFLAGS register to inc or dec EDI and ESI
+let Defs = [EDI,ESI], Uses = [EDI,ESI,EFLAGS] in {
+def MOVSB : I<0xA4, RawFrm, (outs), (ins), "{movsb}", []>;
+def MOVSW : I<0xA5, RawFrm, (outs), (ins), "{movsw}", []>, OpSize;
+def MOVSD : I<0xA5, RawFrm, (outs), (ins), "{movsl|movsd}", []>;
+}
+
+let Defs = [ECX,EDI], Uses = [AL,ECX,EDI], isCodeGenOnly = 1 in
+def REP_STOSB : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb|rep stosb}",
+ [(X86rep_stos i8)]>, REP;
+let Defs = [ECX,EDI], Uses = [AX,ECX,EDI], isCodeGenOnly = 1 in
+def REP_STOSW : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw|rep stosw}",
+ [(X86rep_stos i16)]>, REP, OpSize;
+let Defs = [ECX,EDI], Uses = [EAX,ECX,EDI], isCodeGenOnly = 1 in
+def REP_STOSD : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}",
+ [(X86rep_stos i32)]>, REP;
+
+// These use the DF flag in the EFLAGS register to inc or dec EDI and ESI
+let Defs = [EDI], Uses = [AL,EDI,EFLAGS] in
+def STOSB : I<0xAA, RawFrm, (outs), (ins), "{stosb}", []>;
+let Defs = [EDI], Uses = [AX,EDI,EFLAGS] in
+def STOSW : I<0xAB, RawFrm, (outs), (ins), "{stosw}", []>, OpSize;
+let Defs = [EDI], Uses = [EAX,EDI,EFLAGS] in
+def STOSD : I<0xAB, RawFrm, (outs), (ins), "{stosl|stosd}", []>;
+
+def SCAS8 : I<0xAE, RawFrm, (outs), (ins), "scas{b}", []>;
+def SCAS16 : I<0xAF, RawFrm, (outs), (ins), "scas{w}", []>, OpSize;
+def SCAS32 : I<0xAF, RawFrm, (outs), (ins), "scas{l}", []>;
+
+def CMPS8 : I<0xA6, RawFrm, (outs), (ins), "cmps{b}", []>;
+def CMPS16 : I<0xA7, RawFrm, (outs), (ins), "cmps{w}", []>, OpSize;
+def CMPS32 : I<0xA7, RawFrm, (outs), (ins), "cmps{l}", []>;
+
+let Defs = [RAX, RDX] in
+def RDTSC : I<0x31, RawFrm, (outs), (ins), "rdtsc", [(X86rdtsc)]>,
+ TB;
+
+let Defs = [RAX, RCX, RDX] in
+def RDTSCP : I<0x01, MRM_F9, (outs), (ins), "rdtscp", []>, TB;
+
+let isTerminator = 1, isBarrier = 1, hasCtrlDep = 1 in {
+def TRAP : I<0x0B, RawFrm, (outs), (ins), "ud2", [(trap)]>, TB;
+}
+
+def SYSCALL : I<0x05, RawFrm,
+ (outs), (ins), "syscall", []>, TB;
+def SYSRET : I<0x07, RawFrm,
+ (outs), (ins), "sysret", []>, TB;
+def SYSENTER : I<0x34, RawFrm,
+ (outs), (ins), "sysenter", []>, TB;
+def SYSEXIT : I<0x35, RawFrm,
+ (outs), (ins), "sysexit", []>, TB;
+
+def WAIT : I<0x9B, RawFrm, (outs), (ins), "wait", []>;
+
+
+//===----------------------------------------------------------------------===//
+// Input/Output Instructions...
+//
+let Defs = [AL], Uses = [DX] in
+def IN8rr : I<0xEC, RawFrm, (outs), (ins),
+ "in{b}\t{%dx, %al|%AL, %DX}", []>;
+let Defs = [AX], Uses = [DX] in
+def IN16rr : I<0xED, RawFrm, (outs), (ins),
+ "in{w}\t{%dx, %ax|%AX, %DX}", []>, OpSize;
+let Defs = [EAX], Uses = [DX] in
+def IN32rr : I<0xED, RawFrm, (outs), (ins),
+ "in{l}\t{%dx, %eax|%EAX, %DX}", []>;
+
+let Defs = [AL] in
+def IN8ri : Ii8<0xE4, RawFrm, (outs), (ins i16i8imm:$port),
+ "in{b}\t{$port, %al|%AL, $port}", []>;
+let Defs = [AX] in
+def IN16ri : Ii8<0xE5, RawFrm, (outs), (ins i16i8imm:$port),
+ "in{w}\t{$port, %ax|%AX, $port}", []>, OpSize;
+let Defs = [EAX] in
+def IN32ri : Ii8<0xE5, RawFrm, (outs), (ins i16i8imm:$port),
+ "in{l}\t{$port, %eax|%EAX, $port}", []>;
+
+let Uses = [DX, AL] in
+def OUT8rr : I<0xEE, RawFrm, (outs), (ins),
+ "out{b}\t{%al, %dx|%DX, %AL}", []>;
+let Uses = [DX, AX] in
+def OUT16rr : I<0xEF, RawFrm, (outs), (ins),
+ "out{w}\t{%ax, %dx|%DX, %AX}", []>, OpSize;
+let Uses = [DX, EAX] in
+def OUT32rr : I<0xEF, RawFrm, (outs), (ins),
+ "out{l}\t{%eax, %dx|%DX, %EAX}", []>;
+
+let Uses = [AL] in
+def OUT8ir : Ii8<0xE6, RawFrm, (outs), (ins i16i8imm:$port),
+ "out{b}\t{%al, $port|$port, %AL}", []>;
+let Uses = [AX] in
+def OUT16ir : Ii8<0xE7, RawFrm, (outs), (ins i16i8imm:$port),
+ "out{w}\t{%ax, $port|$port, %AX}", []>, OpSize;
+let Uses = [EAX] in
+def OUT32ir : Ii8<0xE7, RawFrm, (outs), (ins i16i8imm:$port),
+ "out{l}\t{%eax, $port|$port, %EAX}", []>;
+
+def IN8 : I<0x6C, RawFrm, (outs), (ins),
+ "ins{b}", []>;
+def IN16 : I<0x6D, RawFrm, (outs), (ins),
+ "ins{w}", []>, OpSize;
+def IN32 : I<0x6D, RawFrm, (outs), (ins),
+ "ins{l}", []>;
+
+//===----------------------------------------------------------------------===//
+// Move Instructions...
+//
+let neverHasSideEffects = 1 in {
+def MOV8rr : I<0x88, MRMDestReg, (outs GR8 :$dst), (ins GR8 :$src),
+ "mov{b}\t{$src, $dst|$dst, $src}", []>;
+def MOV16rr : I<0x89, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize;
+def MOV32rr : I<0x89, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", []>;
+}
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+def MOV8ri : Ii8 <0xB0, AddRegFrm, (outs GR8 :$dst), (ins i8imm :$src),
+ "mov{b}\t{$src, $dst|$dst, $src}",
+ [(set GR8:$dst, imm:$src)]>;
+def MOV16ri : Ii16<0xB8, AddRegFrm, (outs GR16:$dst), (ins i16imm:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, imm:$src)]>, OpSize;
+def MOV32ri : Ii32<0xB8, AddRegFrm, (outs GR32:$dst), (ins i32imm:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, imm:$src)]>;
+}
+
+def MOV8mi : Ii8 <0xC6, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src),
+ "mov{b}\t{$src, $dst|$dst, $src}",
+ [(store (i8 imm:$src), addr:$dst)]>;
+def MOV16mi : Ii16<0xC7, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}",
+ [(store (i16 imm:$src), addr:$dst)]>, OpSize;
+def MOV32mi : Ii32<0xC7, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}",
+ [(store (i32 imm:$src), addr:$dst)]>;
+
+/// moffs8, moffs16 and moffs32 versions of moves. The immediate is an
+/// absolute 32-bit offset from the segment base rather than a PC-relative
+/// displacement. These are only valid in x86-32 mode.
+def MOV8o8a : Ii32 <0xA0, RawFrm, (outs), (ins offset8:$src),
+ "mov{b}\t{$src, %al|%al, $src}", []>;
+def MOV16o16a : Ii32 <0xA1, RawFrm, (outs), (ins offset16:$src),
+ "mov{w}\t{$src, %ax|%ax, $src}", []>, OpSize;
+def MOV32o32a : Ii32 <0xA1, RawFrm, (outs), (ins offset32:$src),
+ "mov{l}\t{$src, %eax|%eax, $src}", []>;
+def MOV8ao8 : Ii32 <0xA2, RawFrm, (outs offset8:$dst), (ins),
+ "mov{b}\t{%al, $dst|$dst, %al}", []>;
+def MOV16ao16 : Ii32 <0xA3, RawFrm, (outs offset16:$dst), (ins),
+ "mov{w}\t{%ax, $dst|$dst, %ax}", []>, OpSize;
+def MOV32ao32 : Ii32 <0xA3, RawFrm, (outs offset32:$dst), (ins),
+ "mov{l}\t{%eax, $dst|$dst, %eax}", []>;
+
+// Moves to and from segment registers
+def MOV16rs : I<0x8C, MRMDestReg, (outs GR16:$dst), (ins SEGMENT_REG:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize;
+def MOV32rs : I<0x8C, MRMDestReg, (outs GR32:$dst), (ins SEGMENT_REG:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", []>;
+def MOV16ms : I<0x8C, MRMDestMem, (outs i16mem:$dst), (ins SEGMENT_REG:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize;
+def MOV32ms : I<0x8C, MRMDestMem, (outs i32mem:$dst), (ins SEGMENT_REG:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", []>;
+def MOV16sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR16:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize;
+def MOV32sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR32:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", []>;
+def MOV16sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i16mem:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize;
+def MOV32sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i32mem:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", []>;
+
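+// Register-to-register moves with the operands reversed in the ModRM byte
+// (opcodes 0x8A/0x8B instead of 0x88/0x89). Like the *_REV logical-op forms
+// below, these are included for the disassembler.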
+let isCodeGenOnly = 1 in {
+def MOV8rr_REV : I<0x8A, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src),
+ "mov{b}\t{$src, $dst|$dst, $src}", []>;
+def MOV16rr_REV : I<0x8B, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize;
+def MOV32rr_REV : I<0x8B, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", []>;
+}
+
+let canFoldAsLoad = 1, isReMaterializable = 1 in {
+def MOV8rm : I<0x8A, MRMSrcMem, (outs GR8 :$dst), (ins i8mem :$src),
+ "mov{b}\t{$src, $dst|$dst, $src}",
+ [(set GR8:$dst, (loadi8 addr:$src))]>;
+def MOV16rm : I<0x8B, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (loadi16 addr:$src))]>, OpSize;
+def MOV32rm : I<0x8B, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (loadi32 addr:$src))]>;
+}
+
+def MOV8mr : I<0x88, MRMDestMem, (outs), (ins i8mem :$dst, GR8 :$src),
+ "mov{b}\t{$src, $dst|$dst, $src}",
+ [(store GR8:$src, addr:$dst)]>;
+def MOV16mr : I<0x89, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
+ "mov{w}\t{$src, $dst|$dst, $src}",
+ [(store GR16:$src, addr:$dst)]>, OpSize;
+def MOV32mr : I<0x89, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}",
+ [(store GR32:$src, addr:$dst)]>;
+
+/// Versions of MOV32rr, MOV32rm, and MOV32mr for i32mem_TC and GR32_TC.
+let neverHasSideEffects = 1 in
+def MOV32rr_TC : I<0x89, MRMDestReg, (outs GR32_TC:$dst), (ins GR32_TC:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", []>;
+
+let mayLoad = 1,
+ canFoldAsLoad = 1, isReMaterializable = 1 in
+def MOV32rm_TC : I<0x8B, MRMSrcMem, (outs GR32_TC:$dst), (ins i32mem_TC:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}",
+ []>;
+
+let mayStore = 1 in
+def MOV32mr_TC : I<0x89, MRMDestMem, (outs), (ins i32mem_TC:$dst, GR32_TC:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}",
+ []>;
+
+// Versions of MOV8rr, MOV8mr, and MOV8rm that use i8mem_NOREX and GR8_NOREX so
+// that they can be used for copying and storing h registers, which can't be
+// encoded when a REX prefix is present.
+let neverHasSideEffects = 1 in
+def MOV8rr_NOREX : I<0x88, MRMDestReg,
+ (outs GR8_NOREX:$dst), (ins GR8_NOREX:$src),
+ "mov{b}\t{$src, $dst|$dst, $src} # NOREX", []>;
+let mayStore = 1 in
+def MOV8mr_NOREX : I<0x88, MRMDestMem,
+ (outs), (ins i8mem_NOREX:$dst, GR8_NOREX:$src),
+ "mov{b}\t{$src, $dst|$dst, $src} # NOREX", []>;
+let mayLoad = 1,
+ canFoldAsLoad = 1, isReMaterializable = 1 in
+def MOV8rm_NOREX : I<0x8A, MRMSrcMem,
+ (outs GR8_NOREX:$dst), (ins i8mem_NOREX:$src),
+ "mov{b}\t{$src, $dst|$dst, $src} # NOREX", []>;
+
+// Moves to and from debug registers
+def MOV32rd : I<0x21, MRMDestReg, (outs GR32:$dst), (ins DEBUG_REG:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", []>, TB;
+def MOV32dr : I<0x23, MRMSrcReg, (outs DEBUG_REG:$dst), (ins GR32:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", []>, TB;
+
+// Moves to and from control registers
+def MOV32rc : I<0x20, MRMDestReg, (outs GR32:$dst), (ins CONTROL_REG:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", []>, TB;
+def MOV32cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR32:$src),
+ "mov{l}\t{$src, $dst|$dst, $src}", []>, TB;
+
+//===----------------------------------------------------------------------===//
+// Fixed-Register Multiplication and Division Instructions...
+//
+
+// Extra precision multiplication
+
+// AL is really implied by AX, but the registers in Defs must match the
+// SDNode results (i8, i32).
+let Defs = [AL,EFLAGS,AX], Uses = [AL] in
+def MUL8r : I<0xF6, MRM4r, (outs), (ins GR8:$src), "mul{b}\t$src",
+ // FIXME: Used for 8-bit mul, ignore result upper 8 bits.
+ // This probably ought to be moved to a def : Pat<> if the
+ // syntax can be accepted.
+ [(set AL, (mul AL, GR8:$src)),
+ (implicit EFLAGS)]>; // AL,AH = AL*GR8
+
+let Defs = [AX,DX,EFLAGS], Uses = [AX], neverHasSideEffects = 1 in
+def MUL16r : I<0xF7, MRM4r, (outs), (ins GR16:$src),
+ "mul{w}\t$src",
+ []>, OpSize; // AX,DX = AX*GR16
+
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX], neverHasSideEffects = 1 in
+def MUL32r : I<0xF7, MRM4r, (outs), (ins GR32:$src),
+ "mul{l}\t$src",
+ []>; // EAX,EDX = EAX*GR32
+
+let Defs = [AL,EFLAGS,AX], Uses = [AL] in
+def MUL8m : I<0xF6, MRM4m, (outs), (ins i8mem :$src),
+ "mul{b}\t$src",
+ // FIXME: Used for 8-bit mul, ignore result upper 8 bits.
+ // This probably ought to be moved to a def : Pat<> if the
+ // syntax can be accepted.
+ [(set AL, (mul AL, (loadi8 addr:$src))),
+ (implicit EFLAGS)]>; // AL,AH = AL*[mem8]
+
+let mayLoad = 1, neverHasSideEffects = 1 in {
+let Defs = [AX,DX,EFLAGS], Uses = [AX] in
+def MUL16m : I<0xF7, MRM4m, (outs), (ins i16mem:$src),
+ "mul{w}\t$src",
+ []>, OpSize; // AX,DX = AX*[mem16]
+
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
+def MUL32m : I<0xF7, MRM4m, (outs), (ins i32mem:$src),
+ "mul{l}\t$src",
+ []>; // EAX,EDX = EAX*[mem32]
+}
+
+let neverHasSideEffects = 1 in {
+let Defs = [AL,EFLAGS,AX], Uses = [AL] in
+def IMUL8r : I<0xF6, MRM5r, (outs), (ins GR8:$src), "imul{b}\t$src", []>;
+ // AL,AH = AL*GR8
+let Defs = [AX,DX,EFLAGS], Uses = [AX] in
+def IMUL16r : I<0xF7, MRM5r, (outs), (ins GR16:$src), "imul{w}\t$src", []>,
+ OpSize; // AX,DX = AX*GR16
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
+def IMUL32r : I<0xF7, MRM5r, (outs), (ins GR32:$src), "imul{l}\t$src", []>;
+ // EAX,EDX = EAX*GR32
+let mayLoad = 1 in {
+let Defs = [AL,EFLAGS,AX], Uses = [AL] in
+def IMUL8m : I<0xF6, MRM5m, (outs), (ins i8mem :$src),
+ "imul{b}\t$src", []>; // AL,AH = AL*[mem8]
+let Defs = [AX,DX,EFLAGS], Uses = [AX] in
+def IMUL16m : I<0xF7, MRM5m, (outs), (ins i16mem:$src),
+ "imul{w}\t$src", []>, OpSize; // AX,DX = AX*[mem16]
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
+def IMUL32m : I<0xF7, MRM5m, (outs), (ins i32mem:$src),
+ "imul{l}\t$src", []>; // EAX,EDX = EAX*[mem32]
+}
+} // neverHasSideEffects
+
+// Unsigned division/remainder.
+let Defs = [AL,EFLAGS,AX], Uses = [AX] in
+def DIV8r : I<0xF6, MRM6r, (outs), (ins GR8:$src), // AX/r8 = AL,AH
+ "div{b}\t$src", []>;
+let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
+def DIV16r : I<0xF7, MRM6r, (outs), (ins GR16:$src), // DX:AX/r16 = AX,DX
+ "div{w}\t$src", []>, OpSize;
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in
+def DIV32r : I<0xF7, MRM6r, (outs), (ins GR32:$src), // EDX:EAX/r32 = EAX,EDX
+ "div{l}\t$src", []>;
+let mayLoad = 1 in {
+let Defs = [AL,EFLAGS,AX], Uses = [AX] in
+def DIV8m : I<0xF6, MRM6m, (outs), (ins i8mem:$src), // AX/[mem8] = AL,AH
+ "div{b}\t$src", []>;
+let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
+def DIV16m : I<0xF7, MRM6m, (outs), (ins i16mem:$src), // DX:AX/[mem16] = AX,DX
+ "div{w}\t$src", []>, OpSize;
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in
+ // EDX:EAX/[mem32] = EAX,EDX
+def DIV32m : I<0xF7, MRM6m, (outs), (ins i32mem:$src),
+ "div{l}\t$src", []>;
+}
+
+// Signed division/remainder.
+let Defs = [AL,EFLAGS,AX], Uses = [AX] in
+def IDIV8r : I<0xF6, MRM7r, (outs), (ins GR8:$src), // AX/r8 = AL,AH
+ "idiv{b}\t$src", []>;
+let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
+def IDIV16r: I<0xF7, MRM7r, (outs), (ins GR16:$src), // DX:AX/r16 = AX,DX
+ "idiv{w}\t$src", []>, OpSize;
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in
+def IDIV32r: I<0xF7, MRM7r, (outs), (ins GR32:$src), // EDX:EAX/r32 = EAX,EDX
+ "idiv{l}\t$src", []>;
+let mayLoad = 1 in {
+let Defs = [AL,EFLAGS,AX], Uses = [AX] in
+def IDIV8m : I<0xF6, MRM7m, (outs), (ins i8mem:$src), // AX/[mem8] = AL,AH
+ "idiv{b}\t$src", []>;
+let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
+def IDIV16m: I<0xF7, MRM7m, (outs), (ins i16mem:$src), // DX:AX/[mem16] = AX,DX
+ "idiv{w}\t$src", []>, OpSize;
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in
+def IDIV32m: I<0xF7, MRM7m, (outs), (ins i32mem:$src),
+ // EDX:EAX/[mem32] = EAX,EDX
+ "idiv{l}\t$src", []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Two-Address Instructions.
+//
+let isTwoAddress = 1 in {
+
+// Conditional moves
+let Uses = [EFLAGS] in {
+
+let Predicates = [HasCMov] in {
+let isCommutable = 1 in {
+def CMOVB16rr : I<0x42, MRMSrcReg, // if <u, GR16 = GR16
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "cmovb{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_B, EFLAGS))]>,
+ TB, OpSize;
+def CMOVB32rr : I<0x42, MRMSrcReg, // if <u, GR32 = GR32
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "cmovb{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_B, EFLAGS))]>,
+ TB;
+def CMOVAE16rr: I<0x43, MRMSrcReg, // if >=u, GR16 = GR16
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "cmovae{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_AE, EFLAGS))]>,
+ TB, OpSize;
+def CMOVAE32rr: I<0x43, MRMSrcReg, // if >=u, GR32 = GR32
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "cmovae{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_AE, EFLAGS))]>,
+ TB;
+def CMOVE16rr : I<0x44, MRMSrcReg, // if ==, GR16 = GR16
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "cmove{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_E, EFLAGS))]>,
+ TB, OpSize;
+def CMOVE32rr : I<0x44, MRMSrcReg, // if ==, GR32 = GR32
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "cmove{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_E, EFLAGS))]>,
+ TB;
+def CMOVNE16rr: I<0x45, MRMSrcReg, // if !=, GR16 = GR16
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "cmovne{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_NE, EFLAGS))]>,
+ TB, OpSize;
+def CMOVNE32rr: I<0x45, MRMSrcReg, // if !=, GR32 = GR32
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "cmovne{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_NE, EFLAGS))]>,
+ TB;
+def CMOVBE16rr: I<0x46, MRMSrcReg, // if <=u, GR16 = GR16
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "cmovbe{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_BE, EFLAGS))]>,
+ TB, OpSize;
+def CMOVBE32rr: I<0x46, MRMSrcReg, // if <=u, GR32 = GR32
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "cmovbe{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_BE, EFLAGS))]>,
+ TB;
+def CMOVA16rr : I<0x47, MRMSrcReg, // if >u, GR16 = GR16
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "cmova{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_A, EFLAGS))]>,
+ TB, OpSize;
+def CMOVA32rr : I<0x47, MRMSrcReg, // if >u, GR32 = GR32
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "cmova{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_A, EFLAGS))]>,
+ TB;
+def CMOVL16rr : I<0x4C, MRMSrcReg, // if <s, GR16 = GR16
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "cmovl{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_L, EFLAGS))]>,
+ TB, OpSize;
+def CMOVL32rr : I<0x4C, MRMSrcReg, // if <s, GR32 = GR32
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "cmovl{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_L, EFLAGS))]>,
+ TB;
+def CMOVGE16rr: I<0x4D, MRMSrcReg, // if >=s, GR16 = GR16
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "cmovge{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_GE, EFLAGS))]>,
+ TB, OpSize;
+def CMOVGE32rr: I<0x4D, MRMSrcReg, // if >=s, GR32 = GR32
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "cmovge{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_GE, EFLAGS))]>,
+ TB;
+def CMOVLE16rr: I<0x4E, MRMSrcReg, // if <=s, GR16 = GR16
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "cmovle{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_LE, EFLAGS))]>,
+ TB, OpSize;
+def CMOVLE32rr: I<0x4E, MRMSrcReg, // if <=s, GR32 = GR32
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "cmovle{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_LE, EFLAGS))]>,
+ TB;
+def CMOVG16rr : I<0x4F, MRMSrcReg, // if >s, GR16 = GR16
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "cmovg{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_G, EFLAGS))]>,
+ TB, OpSize;
+def CMOVG32rr : I<0x4F, MRMSrcReg, // if >s, GR32 = GR32
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "cmovg{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_G, EFLAGS))]>,
+ TB;
+def CMOVS16rr : I<0x48, MRMSrcReg, // if signed, GR16 = GR16
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "cmovs{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_S, EFLAGS))]>,
+ TB, OpSize;
+def CMOVS32rr : I<0x48, MRMSrcReg, // if signed, GR32 = GR32
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "cmovs{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_S, EFLAGS))]>,
+ TB;
+def CMOVNS16rr: I<0x49, MRMSrcReg, // if !signed, GR16 = GR16
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "cmovns{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_NS, EFLAGS))]>,
+ TB, OpSize;
+def CMOVNS32rr: I<0x49, MRMSrcReg, // if !signed, GR32 = GR32
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "cmovns{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_NS, EFLAGS))]>,
+ TB;
+def CMOVP16rr : I<0x4A, MRMSrcReg, // if parity, GR16 = GR16
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "cmovp{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_P, EFLAGS))]>,
+ TB, OpSize;
+def CMOVP32rr : I<0x4A, MRMSrcReg, // if parity, GR32 = GR32
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "cmovp{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_P, EFLAGS))]>,
+ TB;
+def CMOVNP16rr : I<0x4B, MRMSrcReg, // if !parity, GR16 = GR16
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "cmovnp{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_NP, EFLAGS))]>,
+ TB, OpSize;
+def CMOVNP32rr : I<0x4B, MRMSrcReg, // if !parity, GR32 = GR32
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "cmovnp{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_NP, EFLAGS))]>,
+ TB;
+def CMOVO16rr : I<0x40, MRMSrcReg, // if overflow, GR16 = GR16
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "cmovo{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_O, EFLAGS))]>,
+ TB, OpSize;
+def CMOVO32rr : I<0x40, MRMSrcReg, // if overflow, GR32 = GR32
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "cmovo{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_O, EFLAGS))]>,
+ TB;
+def CMOVNO16rr : I<0x41, MRMSrcReg, // if !overflow, GR16 = GR16
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "cmovno{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_NO, EFLAGS))]>,
+ TB, OpSize;
+def CMOVNO32rr : I<0x41, MRMSrcReg, // if !overflow, GR32 = GR32
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "cmovno{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_NO, EFLAGS))]>,
+ TB;
+} // isCommutable = 1
+
+def CMOVB16rm : I<0x42, MRMSrcMem, // if <u, GR16 = [mem16]
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "cmovb{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_B, EFLAGS))]>,
+ TB, OpSize;
+def CMOVB32rm : I<0x42, MRMSrcMem, // if <u, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmovb{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_B, EFLAGS))]>,
+ TB;
+def CMOVAE16rm: I<0x43, MRMSrcMem, // if >=u, GR16 = [mem16]
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "cmovae{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_AE, EFLAGS))]>,
+ TB, OpSize;
+def CMOVAE32rm: I<0x43, MRMSrcMem, // if >=u, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmovae{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_AE, EFLAGS))]>,
+ TB;
+def CMOVE16rm : I<0x44, MRMSrcMem, // if ==, GR16 = [mem16]
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "cmove{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_E, EFLAGS))]>,
+ TB, OpSize;
+def CMOVE32rm : I<0x44, MRMSrcMem, // if ==, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmove{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_E, EFLAGS))]>,
+ TB;
+def CMOVNE16rm: I<0x45, MRMSrcMem, // if !=, GR16 = [mem16]
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "cmovne{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_NE, EFLAGS))]>,
+ TB, OpSize;
+def CMOVNE32rm: I<0x45, MRMSrcMem, // if !=, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmovne{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_NE, EFLAGS))]>,
+ TB;
+def CMOVBE16rm: I<0x46, MRMSrcMem, // if <=u, GR16 = [mem16]
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "cmovbe{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_BE, EFLAGS))]>,
+ TB, OpSize;
+def CMOVBE32rm: I<0x46, MRMSrcMem, // if <=u, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmovbe{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_BE, EFLAGS))]>,
+ TB;
+def CMOVA16rm : I<0x47, MRMSrcMem, // if >u, GR16 = [mem16]
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "cmova{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_A, EFLAGS))]>,
+ TB, OpSize;
+def CMOVA32rm : I<0x47, MRMSrcMem, // if >u, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmova{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_A, EFLAGS))]>,
+ TB;
+def CMOVL16rm : I<0x4C, MRMSrcMem, // if <s, GR16 = [mem16]
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "cmovl{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_L, EFLAGS))]>,
+ TB, OpSize;
+def CMOVL32rm : I<0x4C, MRMSrcMem, // if <s, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmovl{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_L, EFLAGS))]>,
+ TB;
+def CMOVGE16rm: I<0x4D, MRMSrcMem, // if >=s, GR16 = [mem16]
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "cmovge{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_GE, EFLAGS))]>,
+ TB, OpSize;
+def CMOVGE32rm: I<0x4D, MRMSrcMem, // if >=s, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmovge{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_GE, EFLAGS))]>,
+ TB;
+def CMOVLE16rm: I<0x4E, MRMSrcMem, // if <=s, GR16 = [mem16]
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "cmovle{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_LE, EFLAGS))]>,
+ TB, OpSize;
+def CMOVLE32rm: I<0x4E, MRMSrcMem, // if <=s, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmovle{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_LE, EFLAGS))]>,
+ TB;
+def CMOVG16rm : I<0x4F, MRMSrcMem, // if >s, GR16 = [mem16]
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "cmovg{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_G, EFLAGS))]>,
+ TB, OpSize;
+def CMOVG32rm : I<0x4F, MRMSrcMem, // if >s, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmovg{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_G, EFLAGS))]>,
+ TB;
+def CMOVS16rm : I<0x48, MRMSrcMem, // if signed, GR16 = [mem16]
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "cmovs{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_S, EFLAGS))]>,
+ TB, OpSize;
+def CMOVS32rm : I<0x48, MRMSrcMem, // if signed, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmovs{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_S, EFLAGS))]>,
+ TB;
+def CMOVNS16rm: I<0x49, MRMSrcMem, // if !signed, GR16 = [mem16]
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "cmovns{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_NS, EFLAGS))]>,
+ TB, OpSize;
+def CMOVNS32rm: I<0x49, MRMSrcMem, // if !signed, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmovns{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_NS, EFLAGS))]>,
+ TB;
+def CMOVP16rm : I<0x4A, MRMSrcMem, // if parity, GR16 = [mem16]
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "cmovp{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_P, EFLAGS))]>,
+ TB, OpSize;
+def CMOVP32rm : I<0x4A, MRMSrcMem, // if parity, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmovp{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_P, EFLAGS))]>,
+ TB;
+def CMOVNP16rm : I<0x4B, MRMSrcMem, // if !parity, GR16 = [mem16]
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "cmovnp{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_NP, EFLAGS))]>,
+ TB, OpSize;
+def CMOVNP32rm : I<0x4B, MRMSrcMem, // if !parity, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmovnp{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_NP, EFLAGS))]>,
+ TB;
+def CMOVO16rm : I<0x40, MRMSrcMem, // if overflow, GR16 = [mem16]
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "cmovo{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_O, EFLAGS))]>,
+ TB, OpSize;
+def CMOVO32rm : I<0x40, MRMSrcMem, // if overflow, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmovo{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_O, EFLAGS))]>,
+ TB;
+def CMOVNO16rm : I<0x41, MRMSrcMem, // if !overflow, GR16 = [mem16]
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "cmovno{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_NO, EFLAGS))]>,
+ TB, OpSize;
+def CMOVNO32rm : I<0x41, MRMSrcMem, // if !overflow, GR32 = [mem32]
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "cmovno{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_NO, EFLAGS))]>,
+ TB;
+} // Predicates = [HasCMov]
+
+// X86 doesn't have 8-bit conditional moves, so a custom inserter is used to
+// emit control flow instead. An alternative would be to mark i8 SELECT as
+// Promote; however, that requires promoting the operands and can induce
+// additional i8 register pressure. Note that CMOV_GR8 is conservatively
+// considered to clobber EFLAGS, because if one of the operands is zero, the
+// expansion could involve an xor.
+let usesCustomInserter = 1, isTwoAddress = 0, Defs = [EFLAGS] in {
+def CMOV_GR8 : I<0, Pseudo,
+ (outs GR8:$dst), (ins GR8:$src1, GR8:$src2, i8imm:$cond),
+ "#CMOV_GR8 PSEUDO!",
+ [(set GR8:$dst, (X86cmov GR8:$src1, GR8:$src2,
+ imm:$cond, EFLAGS))]>;
+
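+// On targets without CMOV, the 16/32-bit integer and x87 selects below are
+// likewise lowered to control flow by the custom inserter.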
+let Predicates = [NoCMov] in {
+def CMOV_GR32 : I<0, Pseudo,
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2, i8imm:$cond),
+ "#CMOV_GR32* PSEUDO!",
+ [(set GR32:$dst,
+ (X86cmov GR32:$src1, GR32:$src2, imm:$cond, EFLAGS))]>;
+def CMOV_GR16 : I<0, Pseudo,
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2, i8imm:$cond),
+ "#CMOV_GR16* PSEUDO!",
+ [(set GR16:$dst,
+ (X86cmov GR16:$src1, GR16:$src2, imm:$cond, EFLAGS))]>;
+def CMOV_RFP32 : I<0, Pseudo,
+ (outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2, i8imm:$cond),
+ "#CMOV_RFP32 PSEUDO!",
+ [(set RFP32:$dst, (X86cmov RFP32:$src1, RFP32:$src2, imm:$cond,
+ EFLAGS))]>;
+def CMOV_RFP64 : I<0, Pseudo,
+ (outs RFP64:$dst), (ins RFP64:$src1, RFP64:$src2, i8imm:$cond),
+ "#CMOV_RFP64 PSEUDO!",
+ [(set RFP64:$dst, (X86cmov RFP64:$src1, RFP64:$src2, imm:$cond,
+ EFLAGS))]>;
+def CMOV_RFP80 : I<0, Pseudo,
+ (outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2, i8imm:$cond),
+ "#CMOV_RFP80 PSEUDO!",
+ [(set RFP80:$dst, (X86cmov RFP80:$src1, RFP80:$src2, imm:$cond,
+ EFLAGS))]>;
+} // Predicates = [NoCMov]
+} // usesCustomInserter = 1, isTwoAddress = 0, Defs = [EFLAGS]
+} // Uses = [EFLAGS]
+
+
+// Unary instructions.
+let CodeSize = 2 in {
+let Defs = [EFLAGS] in {
+def NEG8r : I<0xF6, MRM3r, (outs GR8 :$dst), (ins GR8 :$src), "neg{b}\t$dst",
+ [(set GR8:$dst, (ineg GR8:$src)),
+ (implicit EFLAGS)]>;
+def NEG16r : I<0xF7, MRM3r, (outs GR16:$dst), (ins GR16:$src), "neg{w}\t$dst",
+ [(set GR16:$dst, (ineg GR16:$src)),
+ (implicit EFLAGS)]>, OpSize;
+def NEG32r : I<0xF7, MRM3r, (outs GR32:$dst), (ins GR32:$src), "neg{l}\t$dst",
+ [(set GR32:$dst, (ineg GR32:$src)),
+ (implicit EFLAGS)]>;
+let isTwoAddress = 0 in {
+ def NEG8m : I<0xF6, MRM3m, (outs), (ins i8mem :$dst), "neg{b}\t$dst",
+ [(store (ineg (loadi8 addr:$dst)), addr:$dst),
+ (implicit EFLAGS)]>;
+ def NEG16m : I<0xF7, MRM3m, (outs), (ins i16mem:$dst), "neg{w}\t$dst",
+ [(store (ineg (loadi16 addr:$dst)), addr:$dst),
+ (implicit EFLAGS)]>, OpSize;
+ def NEG32m : I<0xF7, MRM3m, (outs), (ins i32mem:$dst), "neg{l}\t$dst",
+ [(store (ineg (loadi32 addr:$dst)), addr:$dst),
+ (implicit EFLAGS)]>;
+}
+} // Defs = [EFLAGS]
+
+// Match xor with -1 to not. These are favored (via the added complexity) over
+// a move-immediate plus xor, to save code size.
+let AddedComplexity = 15 in {
+def NOT8r : I<0xF6, MRM2r, (outs GR8 :$dst), (ins GR8 :$src), "not{b}\t$dst",
+ [(set GR8:$dst, (not GR8:$src))]>;
+def NOT16r : I<0xF7, MRM2r, (outs GR16:$dst), (ins GR16:$src), "not{w}\t$dst",
+ [(set GR16:$dst, (not GR16:$src))]>, OpSize;
+def NOT32r : I<0xF7, MRM2r, (outs GR32:$dst), (ins GR32:$src), "not{l}\t$dst",
+ [(set GR32:$dst, (not GR32:$src))]>;
+}
+let isTwoAddress = 0 in {
+ def NOT8m : I<0xF6, MRM2m, (outs), (ins i8mem :$dst), "not{b}\t$dst",
+ [(store (not (loadi8 addr:$dst)), addr:$dst)]>;
+ def NOT16m : I<0xF7, MRM2m, (outs), (ins i16mem:$dst), "not{w}\t$dst",
+ [(store (not (loadi16 addr:$dst)), addr:$dst)]>, OpSize;
+ def NOT32m : I<0xF7, MRM2m, (outs), (ins i32mem:$dst), "not{l}\t$dst",
+ [(store (not (loadi32 addr:$dst)), addr:$dst)]>;
+}
+} // CodeSize
+
+// TODO: inc/dec is slow for P4, but fast for Pentium-M.
+let Defs = [EFLAGS] in {
+let CodeSize = 2 in
+def INC8r : I<0xFE, MRM0r, (outs GR8 :$dst), (ins GR8 :$src), "inc{b}\t$dst",
+ [(set GR8:$dst, EFLAGS, (X86inc_flag GR8:$src))]>;
+
+let isConvertibleToThreeAddress = 1, CodeSize = 1 in { // Can xform into LEA.
+def INC16r : I<0x40, AddRegFrm, (outs GR16:$dst), (ins GR16:$src),
+ "inc{w}\t$dst",
+ [(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src))]>,
+ OpSize, Requires<[In32BitMode]>;
+def INC32r : I<0x40, AddRegFrm, (outs GR32:$dst), (ins GR32:$src),
+ "inc{l}\t$dst",
+ [(set GR32:$dst, EFLAGS, (X86inc_flag GR32:$src))]>,
+ Requires<[In32BitMode]>;
+}
+let isTwoAddress = 0, CodeSize = 2 in {
+ def INC8m : I<0xFE, MRM0m, (outs), (ins i8mem :$dst), "inc{b}\t$dst",
+ [(store (add (loadi8 addr:$dst), 1), addr:$dst),
+ (implicit EFLAGS)]>;
+ def INC16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst), "inc{w}\t$dst",
+ [(store (add (loadi16 addr:$dst), 1), addr:$dst),
+ (implicit EFLAGS)]>,
+ OpSize, Requires<[In32BitMode]>;
+ def INC32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst), "inc{l}\t$dst",
+ [(store (add (loadi32 addr:$dst), 1), addr:$dst),
+ (implicit EFLAGS)]>,
+ Requires<[In32BitMode]>;
+}
+
+let CodeSize = 2 in
+def DEC8r : I<0xFE, MRM1r, (outs GR8 :$dst), (ins GR8 :$src), "dec{b}\t$dst",
+ [(set GR8:$dst, EFLAGS, (X86dec_flag GR8:$src))]>;
+let isConvertibleToThreeAddress = 1, CodeSize = 1 in { // Can xform into LEA.
+def DEC16r : I<0x48, AddRegFrm, (outs GR16:$dst), (ins GR16:$src),
+ "dec{w}\t$dst",
+ [(set GR16:$dst, EFLAGS, (X86dec_flag GR16:$src))]>,
+ OpSize, Requires<[In32BitMode]>;
+def DEC32r : I<0x48, AddRegFrm, (outs GR32:$dst), (ins GR32:$src),
+ "dec{l}\t$dst",
+ [(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src))]>,
+ Requires<[In32BitMode]>;
+}
+
+let isTwoAddress = 0, CodeSize = 2 in {
+ def DEC8m : I<0xFE, MRM1m, (outs), (ins i8mem :$dst), "dec{b}\t$dst",
+ [(store (add (loadi8 addr:$dst), -1), addr:$dst),
+ (implicit EFLAGS)]>;
+ def DEC16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst), "dec{w}\t$dst",
+ [(store (add (loadi16 addr:$dst), -1), addr:$dst),
+ (implicit EFLAGS)]>,
+ OpSize, Requires<[In32BitMode]>;
+ def DEC32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst), "dec{l}\t$dst",
+ [(store (add (loadi32 addr:$dst), -1), addr:$dst),
+ (implicit EFLAGS)]>,
+ Requires<[In32BitMode]>;
+}
+} // Defs = [EFLAGS]
+
+// Logical operators...
+let Defs = [EFLAGS] in {
+let isCommutable = 1 in { // X = AND Y, Z --> X = AND Z, Y
+def AND8rr : I<0x20, MRMDestReg,
+ (outs GR8 :$dst), (ins GR8 :$src1, GR8 :$src2),
+ "and{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, EFLAGS, (X86and_flag GR8:$src1, GR8:$src2))]>;
+def AND16rr : I<0x21, MRMDestReg,
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "and{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, EFLAGS, (X86and_flag GR16:$src1,
+ GR16:$src2))]>, OpSize;
+def AND32rr : I<0x21, MRMDestReg,
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "and{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, EFLAGS, (X86and_flag GR32:$src1,
+ GR32:$src2))]>;
+}
+
+// AND instructions with the destination register in REG and the source register
+// in R/M. Included for the disassembler.
+let isCodeGenOnly = 1 in {
+def AND8rr_REV : I<0x22, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src1, GR8:$src2),
+ "and{b}\t{$src2, $dst|$dst, $src2}", []>;
+def AND16rr_REV : I<0x23, MRMSrcReg, (outs GR16:$dst),
+ (ins GR16:$src1, GR16:$src2),
+ "and{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize;
+def AND32rr_REV : I<0x23, MRMSrcReg, (outs GR32:$dst),
+ (ins GR32:$src1, GR32:$src2),
+ "and{l}\t{$src2, $dst|$dst, $src2}", []>;
+}
+
+def AND8rm : I<0x22, MRMSrcMem,
+ (outs GR8 :$dst), (ins GR8 :$src1, i8mem :$src2),
+ "and{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, EFLAGS, (X86and_flag GR8:$src1,
+ (loadi8 addr:$src2)))]>;
+def AND16rm : I<0x23, MRMSrcMem,
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "and{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, EFLAGS, (X86and_flag GR16:$src1,
+ (loadi16 addr:$src2)))]>,
+ OpSize;
+def AND32rm : I<0x23, MRMSrcMem,
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "and{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, EFLAGS, (X86and_flag GR32:$src1,
+ (loadi32 addr:$src2)))]>;
+
+def AND8ri : Ii8<0x80, MRM4r,
+ (outs GR8 :$dst), (ins GR8 :$src1, i8imm :$src2),
+ "and{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, EFLAGS, (X86and_flag GR8:$src1,
+ imm:$src2))]>;
+def AND16ri : Ii16<0x81, MRM4r,
+ (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+ "and{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, EFLAGS, (X86and_flag GR16:$src1,
+ imm:$src2))]>, OpSize;
+def AND32ri : Ii32<0x81, MRM4r,
+ (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
+ "and{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, EFLAGS, (X86and_flag GR32:$src1,
+ imm:$src2))]>;
+def AND16ri8 : Ii8<0x83, MRM4r,
+ (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
+ "and{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, EFLAGS, (X86and_flag GR16:$src1,
+ i16immSExt8:$src2))]>,
+ OpSize;
+def AND32ri8 : Ii8<0x83, MRM4r,
+ (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
+ "and{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, EFLAGS, (X86and_flag GR32:$src1,
+ i32immSExt8:$src2))]>;
+
+let isTwoAddress = 0 in {
+ def AND8mr : I<0x20, MRMDestMem,
+ (outs), (ins i8mem :$dst, GR8 :$src),
+ "and{b}\t{$src, $dst|$dst, $src}",
+ [(store (and (load addr:$dst), GR8:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+ def AND16mr : I<0x21, MRMDestMem,
+ (outs), (ins i16mem:$dst, GR16:$src),
+ "and{w}\t{$src, $dst|$dst, $src}",
+ [(store (and (load addr:$dst), GR16:$src), addr:$dst),
+ (implicit EFLAGS)]>,
+ OpSize;
+ def AND32mr : I<0x21, MRMDestMem,
+ (outs), (ins i32mem:$dst, GR32:$src),
+ "and{l}\t{$src, $dst|$dst, $src}",
+ [(store (and (load addr:$dst), GR32:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+ def AND8mi : Ii8<0x80, MRM4m,
+ (outs), (ins i8mem :$dst, i8imm :$src),
+ "and{b}\t{$src, $dst|$dst, $src}",
+ [(store (and (loadi8 addr:$dst), imm:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+ def AND16mi : Ii16<0x81, MRM4m,
+ (outs), (ins i16mem:$dst, i16imm:$src),
+ "and{w}\t{$src, $dst|$dst, $src}",
+ [(store (and (loadi16 addr:$dst), imm:$src), addr:$dst),
+ (implicit EFLAGS)]>,
+ OpSize;
+ def AND32mi : Ii32<0x81, MRM4m,
+ (outs), (ins i32mem:$dst, i32imm:$src),
+ "and{l}\t{$src, $dst|$dst, $src}",
+ [(store (and (loadi32 addr:$dst), imm:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+ def AND16mi8 : Ii8<0x83, MRM4m,
+ (outs), (ins i16mem:$dst, i16i8imm :$src),
+ "and{w}\t{$src, $dst|$dst, $src}",
+ [(store (and (load addr:$dst), i16immSExt8:$src), addr:$dst),
+ (implicit EFLAGS)]>,
+ OpSize;
+ def AND32mi8 : Ii8<0x83, MRM4m,
+ (outs), (ins i32mem:$dst, i32i8imm :$src),
+ "and{l}\t{$src, $dst|$dst, $src}",
+ [(store (and (load addr:$dst), i32immSExt8:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+
+ def AND8i8 : Ii8<0x24, RawFrm, (outs), (ins i8imm:$src),
+ "and{b}\t{$src, %al|%al, $src}", []>;
+ def AND16i16 : Ii16<0x25, RawFrm, (outs), (ins i16imm:$src),
+ "and{w}\t{$src, %ax|%ax, $src}", []>, OpSize;
+ def AND32i32 : Ii32<0x25, RawFrm, (outs), (ins i32imm:$src),
+ "and{l}\t{$src, %eax|%eax, $src}", []>;
+
+} // isTwoAddress = 0
+
+
+let isCommutable = 1 in { // X = OR Y, Z --> X = OR Z, Y
+def OR8rr : I<0x08, MRMDestReg, (outs GR8 :$dst),
+ (ins GR8 :$src1, GR8 :$src2),
+ "or{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, EFLAGS, (X86or_flag GR8:$src1, GR8:$src2))]>;
+def OR16rr : I<0x09, MRMDestReg, (outs GR16:$dst),
+ (ins GR16:$src1, GR16:$src2),
+ "or{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, EFLAGS, (X86or_flag GR16:$src1,GR16:$src2))]>,
+ OpSize;
+def OR32rr : I<0x09, MRMDestReg, (outs GR32:$dst),
+ (ins GR32:$src1, GR32:$src2),
+ "or{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, EFLAGS, (X86or_flag GR32:$src1,GR32:$src2))]>;
+}
+
+// OR instructions with the destination register in REG and the source register
+// in R/M. Included for the disassembler.
+let isCodeGenOnly = 1 in {
+def OR8rr_REV : I<0x0A, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src1, GR8:$src2),
+ "or{b}\t{$src2, $dst|$dst, $src2}", []>;
+def OR16rr_REV : I<0x0B, MRMSrcReg, (outs GR16:$dst),
+ (ins GR16:$src1, GR16:$src2),
+ "or{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize;
+def OR32rr_REV : I<0x0B, MRMSrcReg, (outs GR32:$dst),
+ (ins GR32:$src1, GR32:$src2),
+ "or{l}\t{$src2, $dst|$dst, $src2}", []>;
+}
+
+def OR8rm : I<0x0A, MRMSrcMem, (outs GR8 :$dst),
+ (ins GR8 :$src1, i8mem :$src2),
+ "or{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, EFLAGS, (X86or_flag GR8:$src1,
+ (load addr:$src2)))]>;
+def OR16rm : I<0x0B, MRMSrcMem, (outs GR16:$dst),
+ (ins GR16:$src1, i16mem:$src2),
+ "or{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, EFLAGS, (X86or_flag GR16:$src1,
+ (load addr:$src2)))]>,
+ OpSize;
+def OR32rm : I<0x0B, MRMSrcMem, (outs GR32:$dst),
+ (ins GR32:$src1, i32mem:$src2),
+ "or{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, EFLAGS, (X86or_flag GR32:$src1,
+ (load addr:$src2)))]>;
+
+def OR8ri : Ii8 <0x80, MRM1r, (outs GR8 :$dst),
+ (ins GR8 :$src1, i8imm:$src2),
+ "or{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst,EFLAGS, (X86or_flag GR8:$src1, imm:$src2))]>;
+def OR16ri : Ii16<0x81, MRM1r, (outs GR16:$dst),
+ (ins GR16:$src1, i16imm:$src2),
+ "or{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, EFLAGS, (X86or_flag GR16:$src1,
+ imm:$src2))]>, OpSize;
+def OR32ri : Ii32<0x81, MRM1r, (outs GR32:$dst),
+ (ins GR32:$src1, i32imm:$src2),
+ "or{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, EFLAGS, (X86or_flag GR32:$src1,
+ imm:$src2))]>;
+
+def OR16ri8 : Ii8<0x83, MRM1r, (outs GR16:$dst),
+ (ins GR16:$src1, i16i8imm:$src2),
+ "or{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, EFLAGS, (X86or_flag GR16:$src1,
+ i16immSExt8:$src2))]>, OpSize;
+def OR32ri8 : Ii8<0x83, MRM1r, (outs GR32:$dst),
+ (ins GR32:$src1, i32i8imm:$src2),
+ "or{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, EFLAGS, (X86or_flag GR32:$src1,
+ i32immSExt8:$src2))]>;
+let isTwoAddress = 0 in {
+ def OR8mr : I<0x08, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src),
+ "or{b}\t{$src, $dst|$dst, $src}",
+ [(store (or (load addr:$dst), GR8:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+ def OR16mr : I<0x09, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
+ "or{w}\t{$src, $dst|$dst, $src}",
+ [(store (or (load addr:$dst), GR16:$src), addr:$dst),
+ (implicit EFLAGS)]>, OpSize;
+ def OR32mr : I<0x09, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+ "or{l}\t{$src, $dst|$dst, $src}",
+ [(store (or (load addr:$dst), GR32:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+ def OR8mi : Ii8<0x80, MRM1m, (outs), (ins i8mem :$dst, i8imm:$src),
+ "or{b}\t{$src, $dst|$dst, $src}",
+ [(store (or (loadi8 addr:$dst), imm:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+ def OR16mi : Ii16<0x81, MRM1m, (outs), (ins i16mem:$dst, i16imm:$src),
+ "or{w}\t{$src, $dst|$dst, $src}",
+ [(store (or (loadi16 addr:$dst), imm:$src), addr:$dst),
+ (implicit EFLAGS)]>,
+ OpSize;
+ def OR32mi : Ii32<0x81, MRM1m, (outs), (ins i32mem:$dst, i32imm:$src),
+ "or{l}\t{$src, $dst|$dst, $src}",
+ [(store (or (loadi32 addr:$dst), imm:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+ def OR16mi8 : Ii8<0x83, MRM1m, (outs), (ins i16mem:$dst, i16i8imm:$src),
+ "or{w}\t{$src, $dst|$dst, $src}",
+ [(store (or (load addr:$dst), i16immSExt8:$src), addr:$dst),
+ (implicit EFLAGS)]>,
+ OpSize;
+ def OR32mi8 : Ii8<0x83, MRM1m, (outs), (ins i32mem:$dst, i32i8imm:$src),
+ "or{l}\t{$src, $dst|$dst, $src}",
+ [(store (or (load addr:$dst), i32immSExt8:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+
+ def OR8i8 : Ii8 <0x0C, RawFrm, (outs), (ins i8imm:$src),
+ "or{b}\t{$src, %al|%al, $src}", []>;
+ def OR16i16 : Ii16 <0x0D, RawFrm, (outs), (ins i16imm:$src),
+ "or{w}\t{$src, %ax|%ax, $src}", []>, OpSize;
+ def OR32i32 : Ii32 <0x0D, RawFrm, (outs), (ins i32imm:$src),
+ "or{l}\t{$src, %eax|%eax, $src}", []>;
+} // isTwoAddress = 0
+
+
+let isCommutable = 1 in { // X = XOR Y, Z --> X = XOR Z, Y
+ def XOR8rr : I<0x30, MRMDestReg,
+ (outs GR8 :$dst), (ins GR8 :$src1, GR8 :$src2),
+ "xor{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, EFLAGS, (X86xor_flag GR8:$src1,
+ GR8:$src2))]>;
+ def XOR16rr : I<0x31, MRMDestReg,
+ (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+ "xor{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, EFLAGS, (X86xor_flag GR16:$src1,
+ GR16:$src2))]>, OpSize;
+ def XOR32rr : I<0x31, MRMDestReg,
+ (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+ "xor{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, EFLAGS, (X86xor_flag GR32:$src1,
+ GR32:$src2))]>;
+} // isCommutable = 1
+
+// XOR instructions with the destination register in REG and the source register
+// in R/M. Included for the disassembler.
+let isCodeGenOnly = 1 in {
+def XOR8rr_REV : I<0x32, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src1, GR8:$src2),
+ "xor{b}\t{$src2, $dst|$dst, $src2}", []>;
+def XOR16rr_REV : I<0x33, MRMSrcReg, (outs GR16:$dst),
+ (ins GR16:$src1, GR16:$src2),
+ "xor{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize;
+def XOR32rr_REV : I<0x33, MRMSrcReg, (outs GR32:$dst),
+ (ins GR32:$src1, GR32:$src2),
+ "xor{l}\t{$src2, $dst|$dst, $src2}", []>;
+}
+
+def XOR8rm : I<0x32, MRMSrcMem,
+ (outs GR8 :$dst), (ins GR8:$src1, i8mem :$src2),
+ "xor{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, EFLAGS, (X86xor_flag GR8:$src1,
+ (load addr:$src2)))]>;
+def XOR16rm : I<0x33, MRMSrcMem,
+ (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+ "xor{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, EFLAGS, (X86xor_flag GR16:$src1,
+ (load addr:$src2)))]>,
+ OpSize;
+def XOR32rm : I<0x33, MRMSrcMem,
+ (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+ "xor{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, EFLAGS, (X86xor_flag GR32:$src1,
+ (load addr:$src2)))]>;
+
+def XOR8ri : Ii8<0x80, MRM6r,
+ (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
+ "xor{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, EFLAGS, (X86xor_flag GR8:$src1, imm:$src2))]>;
+def XOR16ri : Ii16<0x81, MRM6r,
+ (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+ "xor{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, EFLAGS, (X86xor_flag GR16:$src1,
+ imm:$src2))]>, OpSize;
+def XOR32ri : Ii32<0x81, MRM6r,
+ (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
+ "xor{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, EFLAGS, (X86xor_flag GR32:$src1,
+ imm:$src2))]>;
+def XOR16ri8 : Ii8<0x83, MRM6r,
+ (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
+ "xor{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, EFLAGS, (X86xor_flag GR16:$src1,
+ i16immSExt8:$src2))]>,
+ OpSize;
+def XOR32ri8 : Ii8<0x83, MRM6r,
+ (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
+ "xor{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, EFLAGS, (X86xor_flag GR32:$src1,
+ i32immSExt8:$src2))]>;
+
+let isTwoAddress = 0 in {
+ def XOR8mr : I<0x30, MRMDestMem,
+ (outs), (ins i8mem :$dst, GR8 :$src),
+ "xor{b}\t{$src, $dst|$dst, $src}",
+ [(store (xor (load addr:$dst), GR8:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+ def XOR16mr : I<0x31, MRMDestMem,
+ (outs), (ins i16mem:$dst, GR16:$src),
+ "xor{w}\t{$src, $dst|$dst, $src}",
+ [(store (xor (load addr:$dst), GR16:$src), addr:$dst),
+ (implicit EFLAGS)]>,
+ OpSize;
+ def XOR32mr : I<0x31, MRMDestMem,
+ (outs), (ins i32mem:$dst, GR32:$src),
+ "xor{l}\t{$src, $dst|$dst, $src}",
+ [(store (xor (load addr:$dst), GR32:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+ def XOR8mi : Ii8<0x80, MRM6m,
+ (outs), (ins i8mem :$dst, i8imm :$src),
+ "xor{b}\t{$src, $dst|$dst, $src}",
+ [(store (xor (loadi8 addr:$dst), imm:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+ def XOR16mi : Ii16<0x81, MRM6m,
+ (outs), (ins i16mem:$dst, i16imm:$src),
+ "xor{w}\t{$src, $dst|$dst, $src}",
+ [(store (xor (loadi16 addr:$dst), imm:$src), addr:$dst),
+ (implicit EFLAGS)]>,
+ OpSize;
+ def XOR32mi : Ii32<0x81, MRM6m,
+ (outs), (ins i32mem:$dst, i32imm:$src),
+ "xor{l}\t{$src, $dst|$dst, $src}",
+ [(store (xor (loadi32 addr:$dst), imm:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+ def XOR16mi8 : Ii8<0x83, MRM6m,
+ (outs), (ins i16mem:$dst, i16i8imm :$src),
+ "xor{w}\t{$src, $dst|$dst, $src}",
+ [(store (xor (load addr:$dst), i16immSExt8:$src), addr:$dst),
+ (implicit EFLAGS)]>,
+ OpSize;
+ def XOR32mi8 : Ii8<0x83, MRM6m,
+ (outs), (ins i32mem:$dst, i32i8imm :$src),
+ "xor{l}\t{$src, $dst|$dst, $src}",
+ [(store (xor (load addr:$dst), i32immSExt8:$src), addr:$dst),
+ (implicit EFLAGS)]>;
+
+ def XOR8i8 : Ii8 <0x34, RawFrm, (outs), (ins i8imm:$src),
+ "xor{b}\t{$src, %al|%al, $src}", []>;
+ def XOR16i16 : Ii16<0x35, RawFrm, (outs), (ins i16imm:$src),
+ "xor{w}\t{$src, %ax|%ax, $src}", []>, OpSize;
+ def XOR32i32 : Ii32<0x35, RawFrm, (outs), (ins i32imm:$src),
+ "xor{l}\t{$src, %eax|%eax, $src}", []>;
+} // isTwoAddress = 0
+} // Defs = [EFLAGS]
+
+// Shift instructions
+let Defs = [EFLAGS] in {
+let Uses = [CL] in {
+def SHL8rCL : I<0xD2, MRM4r, (outs GR8 :$dst), (ins GR8 :$src),
+ "shl{b}\t{%cl, $dst|$dst, CL}",
+ [(set GR8:$dst, (shl GR8:$src, CL))]>;
+def SHL16rCL : I<0xD3, MRM4r, (outs GR16:$dst), (ins GR16:$src),
+ "shl{w}\t{%cl, $dst|$dst, CL}",
+ [(set GR16:$dst, (shl GR16:$src, CL))]>, OpSize;
+def SHL32rCL : I<0xD3, MRM4r, (outs GR32:$dst), (ins GR32:$src),
+ "shl{l}\t{%cl, $dst|$dst, CL}",
+ [(set GR32:$dst, (shl GR32:$src, CL))]>;
+} // Uses = [CL]
+
+def SHL8ri : Ii8<0xC0, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2),
+ "shl{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (shl GR8:$src1, (i8 imm:$src2)))]>;
+let isConvertibleToThreeAddress = 1 in { // Can transform into LEA.
+def SHL16ri : Ii8<0xC1, MRM4r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
+ "shl{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (shl GR16:$src1, (i8 imm:$src2)))]>, OpSize;
+def SHL32ri : Ii8<0xC1, MRM4r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
+ "shl{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (shl GR32:$src1, (i8 imm:$src2)))]>;
+
+// NOTE: We don't include patterns for shifts of a register by one, because
+// 'add reg,reg' is cheaper.
+
+def SHL8r1 : I<0xD0, MRM4r, (outs GR8:$dst), (ins GR8:$src1),
+ "shl{b}\t$dst", []>;
+def SHL16r1 : I<0xD1, MRM4r, (outs GR16:$dst), (ins GR16:$src1),
+ "shl{w}\t$dst", []>, OpSize;
+def SHL32r1 : I<0xD1, MRM4r, (outs GR32:$dst), (ins GR32:$src1),
+ "shl{l}\t$dst", []>;
+
+} // isConvertibleToThreeAddress = 1
+
+let isTwoAddress = 0 in {
+ let Uses = [CL] in {
+ def SHL8mCL : I<0xD2, MRM4m, (outs), (ins i8mem :$dst),
+ "shl{b}\t{%cl, $dst|$dst, CL}",
+ [(store (shl (loadi8 addr:$dst), CL), addr:$dst)]>;
+ def SHL16mCL : I<0xD3, MRM4m, (outs), (ins i16mem:$dst),
+ "shl{w}\t{%cl, $dst|$dst, CL}",
+ [(store (shl (loadi16 addr:$dst), CL), addr:$dst)]>, OpSize;
+ def SHL32mCL : I<0xD3, MRM4m, (outs), (ins i32mem:$dst),
+ "shl{l}\t{%cl, $dst|$dst, CL}",
+ [(store (shl (loadi32 addr:$dst), CL), addr:$dst)]>;
+ }
+ def SHL8mi : Ii8<0xC0, MRM4m, (outs), (ins i8mem :$dst, i8imm:$src),
+ "shl{b}\t{$src, $dst|$dst, $src}",
+ [(store (shl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+ def SHL16mi : Ii8<0xC1, MRM4m, (outs), (ins i16mem:$dst, i8imm:$src),
+ "shl{w}\t{$src, $dst|$dst, $src}",
+ [(store (shl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ OpSize;
+ def SHL32mi : Ii8<0xC1, MRM4m, (outs), (ins i32mem:$dst, i8imm:$src),
+ "shl{l}\t{$src, $dst|$dst, $src}",
+ [(store (shl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+
+ // Shift by 1
+ def SHL8m1 : I<0xD0, MRM4m, (outs), (ins i8mem :$dst),
+ "shl{b}\t$dst",
+ [(store (shl (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
+ def SHL16m1 : I<0xD1, MRM4m, (outs), (ins i16mem:$dst),
+ "shl{w}\t$dst",
+ [(store (shl (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,
+ OpSize;
+ def SHL32m1 : I<0xD1, MRM4m, (outs), (ins i32mem:$dst),
+ "shl{l}\t$dst",
+ [(store (shl (loadi32 addr:$dst), (i8 1)), addr:$dst)]>;
+}
+
+let Uses = [CL] in {
+def SHR8rCL : I<0xD2, MRM5r, (outs GR8 :$dst), (ins GR8 :$src),
+ "shr{b}\t{%cl, $dst|$dst, CL}",
+ [(set GR8:$dst, (srl GR8:$src, CL))]>;
+def SHR16rCL : I<0xD3, MRM5r, (outs GR16:$dst), (ins GR16:$src),
+ "shr{w}\t{%cl, $dst|$dst, CL}",
+ [(set GR16:$dst, (srl GR16:$src, CL))]>, OpSize;
+def SHR32rCL : I<0xD3, MRM5r, (outs GR32:$dst), (ins GR32:$src),
+ "shr{l}\t{%cl, $dst|$dst, CL}",
+ [(set GR32:$dst, (srl GR32:$src, CL))]>;
+}
+
+def SHR8ri : Ii8<0xC0, MRM5r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
+ "shr{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (srl GR8:$src1, (i8 imm:$src2)))]>;
+def SHR16ri : Ii8<0xC1, MRM5r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
+ "shr{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (srl GR16:$src1, (i8 imm:$src2)))]>, OpSize;
+def SHR32ri : Ii8<0xC1, MRM5r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
+ "shr{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (srl GR32:$src1, (i8 imm:$src2)))]>;
+
+// Shift by 1
+def SHR8r1 : I<0xD0, MRM5r, (outs GR8:$dst), (ins GR8:$src1),
+ "shr{b}\t$dst",
+ [(set GR8:$dst, (srl GR8:$src1, (i8 1)))]>;
+def SHR16r1 : I<0xD1, MRM5r, (outs GR16:$dst), (ins GR16:$src1),
+ "shr{w}\t$dst",
+ [(set GR16:$dst, (srl GR16:$src1, (i8 1)))]>, OpSize;
+def SHR32r1 : I<0xD1, MRM5r, (outs GR32:$dst), (ins GR32:$src1),
+ "shr{l}\t$dst",
+ [(set GR32:$dst, (srl GR32:$src1, (i8 1)))]>;
+
+let isTwoAddress = 0 in {
+ let Uses = [CL] in {
+ def SHR8mCL : I<0xD2, MRM5m, (outs), (ins i8mem :$dst),
+ "shr{b}\t{%cl, $dst|$dst, CL}",
+ [(store (srl (loadi8 addr:$dst), CL), addr:$dst)]>;
+ def SHR16mCL : I<0xD3, MRM5m, (outs), (ins i16mem:$dst),
+ "shr{w}\t{%cl, $dst|$dst, CL}",
+ [(store (srl (loadi16 addr:$dst), CL), addr:$dst)]>,
+ OpSize;
+ def SHR32mCL : I<0xD3, MRM5m, (outs), (ins i32mem:$dst),
+ "shr{l}\t{%cl, $dst|$dst, CL}",
+ [(store (srl (loadi32 addr:$dst), CL), addr:$dst)]>;
+ }
+ def SHR8mi : Ii8<0xC0, MRM5m, (outs), (ins i8mem :$dst, i8imm:$src),
+ "shr{b}\t{$src, $dst|$dst, $src}",
+ [(store (srl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+ def SHR16mi : Ii8<0xC1, MRM5m, (outs), (ins i16mem:$dst, i8imm:$src),
+ "shr{w}\t{$src, $dst|$dst, $src}",
+ [(store (srl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ OpSize;
+ def SHR32mi : Ii8<0xC1, MRM5m, (outs), (ins i32mem:$dst, i8imm:$src),
+ "shr{l}\t{$src, $dst|$dst, $src}",
+ [(store (srl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+
+ // Shift by 1
+ def SHR8m1 : I<0xD0, MRM5m, (outs), (ins i8mem :$dst),
+ "shr{b}\t$dst",
+ [(store (srl (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
+ def SHR16m1 : I<0xD1, MRM5m, (outs), (ins i16mem:$dst),
+ "shr{w}\t$dst",
+ [(store (srl (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,OpSize;
+ def SHR32m1 : I<0xD1, MRM5m, (outs), (ins i32mem:$dst),
+ "shr{l}\t$dst",
+ [(store (srl (loadi32 addr:$dst), (i8 1)), addr:$dst)]>;
+}
+
+let Uses = [CL] in {
+def SAR8rCL : I<0xD2, MRM7r, (outs GR8 :$dst), (ins GR8 :$src),
+ "sar{b}\t{%cl, $dst|$dst, CL}",
+ [(set GR8:$dst, (sra GR8:$src, CL))]>;
+def SAR16rCL : I<0xD3, MRM7r, (outs GR16:$dst), (ins GR16:$src),
+ "sar{w}\t{%cl, $dst|$dst, CL}",
+ [(set GR16:$dst, (sra GR16:$src, CL))]>, OpSize;
+def SAR32rCL : I<0xD3, MRM7r, (outs GR32:$dst), (ins GR32:$src),
+ "sar{l}\t{%cl, $dst|$dst, CL}",
+ [(set GR32:$dst, (sra GR32:$src, CL))]>;
+}
+
+def SAR8ri : Ii8<0xC0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2),
+ "sar{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (sra GR8:$src1, (i8 imm:$src2)))]>;
+def SAR16ri : Ii8<0xC1, MRM7r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
+ "sar{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (sra GR16:$src1, (i8 imm:$src2)))]>,
+ OpSize;
+def SAR32ri : Ii8<0xC1, MRM7r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
+ "sar{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (sra GR32:$src1, (i8 imm:$src2)))]>;
+
+// Shift by 1
+def SAR8r1 : I<0xD0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "sar{b}\t$dst",
+ [(set GR8:$dst, (sra GR8:$src1, (i8 1)))]>;
+def SAR16r1 : I<0xD1, MRM7r, (outs GR16:$dst), (ins GR16:$src1),
+ "sar{w}\t$dst",
+ [(set GR16:$dst, (sra GR16:$src1, (i8 1)))]>, OpSize;
+def SAR32r1 : I<0xD1, MRM7r, (outs GR32:$dst), (ins GR32:$src1),
+ "sar{l}\t$dst",
+ [(set GR32:$dst, (sra GR32:$src1, (i8 1)))]>;
+
+let isTwoAddress = 0 in {
+ let Uses = [CL] in {
+ def SAR8mCL : I<0xD2, MRM7m, (outs), (ins i8mem :$dst),
+ "sar{b}\t{%cl, $dst|$dst, CL}",
+ [(store (sra (loadi8 addr:$dst), CL), addr:$dst)]>;
+ def SAR16mCL : I<0xD3, MRM7m, (outs), (ins i16mem:$dst),
+ "sar{w}\t{%cl, $dst|$dst, CL}",
+ [(store (sra (loadi16 addr:$dst), CL), addr:$dst)]>, OpSize;
+ def SAR32mCL : I<0xD3, MRM7m, (outs), (ins i32mem:$dst),
+ "sar{l}\t{%cl, $dst|$dst, CL}",
+ [(store (sra (loadi32 addr:$dst), CL), addr:$dst)]>;
+ }
+ def SAR8mi : Ii8<0xC0, MRM7m, (outs), (ins i8mem :$dst, i8imm:$src),
+ "sar{b}\t{$src, $dst|$dst, $src}",
+ [(store (sra (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+ def SAR16mi : Ii8<0xC1, MRM7m, (outs), (ins i16mem:$dst, i8imm:$src),
+ "sar{w}\t{$src, $dst|$dst, $src}",
+ [(store (sra (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ OpSize;
+ def SAR32mi : Ii8<0xC1, MRM7m, (outs), (ins i32mem:$dst, i8imm:$src),
+ "sar{l}\t{$src, $dst|$dst, $src}",
+ [(store (sra (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+
+ // Shift by 1
+ def SAR8m1 : I<0xD0, MRM7m, (outs), (ins i8mem :$dst),
+ "sar{b}\t$dst",
+ [(store (sra (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
+ def SAR16m1 : I<0xD1, MRM7m, (outs), (ins i16mem:$dst),
+ "sar{w}\t$dst",
+ [(store (sra (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,
+ OpSize;
+ def SAR32m1 : I<0xD1, MRM7m, (outs), (ins i32mem:$dst),
+ "sar{l}\t$dst",
+ [(store (sra (loadi32 addr:$dst), (i8 1)), addr:$dst)]>;
+}
+
+// Rotate instructions
+
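+// RCL/RCR rotate through the carry flag. No selection patterns are provided
+// for them here.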
+def RCL8r1 : I<0xD0, MRM2r, (outs GR8:$dst), (ins GR8:$src),
+ "rcl{b}\t{1, $dst|$dst, 1}", []>;
+let Uses = [CL] in {
+def RCL8rCL : I<0xD2, MRM2r, (outs GR8:$dst), (ins GR8:$src),
+ "rcl{b}\t{%cl, $dst|$dst, CL}", []>;
+}
+def RCL8ri : Ii8<0xC0, MRM2r, (outs GR8:$dst), (ins GR8:$src, i8imm:$cnt),
+ "rcl{b}\t{$cnt, $dst|$dst, $cnt}", []>;
+
+def RCL16r1 : I<0xD1, MRM2r, (outs GR16:$dst), (ins GR16:$src),
+ "rcl{w}\t{1, $dst|$dst, 1}", []>, OpSize;
+let Uses = [CL] in {
+def RCL16rCL : I<0xD3, MRM2r, (outs GR16:$dst), (ins GR16:$src),
+ "rcl{w}\t{%cl, $dst|$dst, CL}", []>, OpSize;
+}
+def RCL16ri : Ii8<0xC1, MRM2r, (outs GR16:$dst), (ins GR16:$src, i8imm:$cnt),
+ "rcl{w}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize;
+
+def RCL32r1 : I<0xD1, MRM2r, (outs GR32:$dst), (ins GR32:$src),
+ "rcl{l}\t{1, $dst|$dst, 1}", []>;
+let Uses = [CL] in {
+def RCL32rCL : I<0xD3, MRM2r, (outs GR32:$dst), (ins GR32:$src),
+ "rcl{l}\t{%cl, $dst|$dst, CL}", []>;
+}
+def RCL32ri : Ii8<0xC1, MRM2r, (outs GR32:$dst), (ins GR32:$src, i8imm:$cnt),
+ "rcl{l}\t{$cnt, $dst|$dst, $cnt}", []>;
+
+def RCR8r1 : I<0xD0, MRM3r, (outs GR8:$dst), (ins GR8:$src),
+ "rcr{b}\t{1, $dst|$dst, 1}", []>;
+let Uses = [CL] in {
+def RCR8rCL : I<0xD2, MRM3r, (outs GR8:$dst), (ins GR8:$src),
+ "rcr{b}\t{%cl, $dst|$dst, CL}", []>;
+}
+def RCR8ri : Ii8<0xC0, MRM3r, (outs GR8:$dst), (ins GR8:$src, i8imm:$cnt),
+ "rcr{b}\t{$cnt, $dst|$dst, $cnt}", []>;
+
+def RCR16r1 : I<0xD1, MRM3r, (outs GR16:$dst), (ins GR16:$src),
+ "rcr{w}\t{1, $dst|$dst, 1}", []>, OpSize;
+let Uses = [CL] in {
+def RCR16rCL : I<0xD3, MRM3r, (outs GR16:$dst), (ins GR16:$src),
+ "rcr{w}\t{%cl, $dst|$dst, CL}", []>, OpSize;
+}
+def RCR16ri : Ii8<0xC1, MRM3r, (outs GR16:$dst), (ins GR16:$src, i8imm:$cnt),
+ "rcr{w}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize;
+
+def RCR32r1 : I<0xD1, MRM3r, (outs GR32:$dst), (ins GR32:$src),
+ "rcr{l}\t{1, $dst|$dst, 1}", []>;
+let Uses = [CL] in {
+def RCR32rCL : I<0xD3, MRM3r, (outs GR32:$dst), (ins GR32:$src),
+ "rcr{l}\t{%cl, $dst|$dst, CL}", []>;
+}
+def RCR32ri : Ii8<0xC1, MRM3r, (outs GR32:$dst), (ins GR32:$src, i8imm:$cnt),
+ "rcr{l}\t{$cnt, $dst|$dst, $cnt}", []>;
+
+let isTwoAddress = 0 in {
+def RCL8m1 : I<0xD0, MRM2m, (outs), (ins i8mem:$dst),
+ "rcl{b}\t{1, $dst|$dst, 1}", []>;
+def RCL8mi : Ii8<0xC0, MRM2m, (outs), (ins i8mem:$dst, i8imm:$cnt),
+ "rcl{b}\t{$cnt, $dst|$dst, $cnt}", []>;
+def RCL16m1 : I<0xD1, MRM2m, (outs), (ins i16mem:$dst),
+ "rcl{w}\t{1, $dst|$dst, 1}", []>, OpSize;
+def RCL16mi : Ii8<0xC1, MRM2m, (outs), (ins i16mem:$dst, i8imm:$cnt),
+ "rcl{w}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize;
+def RCL32m1 : I<0xD1, MRM2m, (outs), (ins i32mem:$dst),
+ "rcl{l}\t{1, $dst|$dst, 1}", []>;
+def RCL32mi : Ii8<0xC1, MRM2m, (outs), (ins i32mem:$dst, i8imm:$cnt),
+ "rcl{l}\t{$cnt, $dst|$dst, $cnt}", []>;
+def RCR8m1 : I<0xD0, MRM3m, (outs), (ins i8mem:$dst),
+ "rcr{b}\t{1, $dst|$dst, 1}", []>;
+def RCR8mi : Ii8<0xC0, MRM3m, (outs), (ins i8mem:$dst, i8imm:$cnt),
+ "rcr{b}\t{$cnt, $dst|$dst, $cnt}", []>;
+def RCR16m1 : I<0xD1, MRM3m, (outs), (ins i16mem:$dst),
+ "rcr{w}\t{1, $dst|$dst, 1}", []>, OpSize;
+def RCR16mi : Ii8<0xC1, MRM3m, (outs), (ins i16mem:$dst, i8imm:$cnt),
+ "rcr{w}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize;
+def RCR32m1 : I<0xD1, MRM3m, (outs), (ins i32mem:$dst),
+ "rcr{l}\t{1, $dst|$dst, 1}", []>;
+def RCR32mi : Ii8<0xC1, MRM3m, (outs), (ins i32mem:$dst, i8imm:$cnt),
+ "rcr{l}\t{$cnt, $dst|$dst, $cnt}", []>;
+
+let Uses = [CL] in {
+def RCL8mCL : I<0xD2, MRM2m, (outs), (ins i8mem:$dst),
+ "rcl{b}\t{%cl, $dst|$dst, CL}", []>;
+def RCL16mCL : I<0xD3, MRM2m, (outs), (ins i16mem:$dst),
+ "rcl{w}\t{%cl, $dst|$dst, CL}", []>, OpSize;
+def RCL32mCL : I<0xD3, MRM2m, (outs), (ins i32mem:$dst),
+ "rcl{l}\t{%cl, $dst|$dst, CL}", []>;
+def RCR8mCL : I<0xD2, MRM3m, (outs), (ins i8mem:$dst),
+ "rcr{b}\t{%cl, $dst|$dst, CL}", []>;
+def RCR16mCL : I<0xD3, MRM3m, (outs), (ins i16mem:$dst),
+ "rcr{w}\t{%cl, $dst|$dst, CL}", []>, OpSize;
+def RCR32mCL : I<0xD3, MRM3m, (outs), (ins i32mem:$dst),
+ "rcr{l}\t{%cl, $dst|$dst, CL}", []>;
+}
+}
+
+// FIXME: provide shorter instructions when imm8 == 1
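+// (The by-1 forms encode one byte shorter than the generic imm8 forms; for
+// example, "roll $1, %eax" can be emitted as D1 C0 instead of C1 C0 01.)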
+let Uses = [CL] in {
+def ROL8rCL : I<0xD2, MRM0r, (outs GR8 :$dst), (ins GR8 :$src),
+ "rol{b}\t{%cl, $dst|$dst, CL}",
+ [(set GR8:$dst, (rotl GR8:$src, CL))]>;
+def ROL16rCL : I<0xD3, MRM0r, (outs GR16:$dst), (ins GR16:$src),
+ "rol{w}\t{%cl, $dst|$dst, CL}",
+ [(set GR16:$dst, (rotl GR16:$src, CL))]>, OpSize;
+def ROL32rCL : I<0xD3, MRM0r, (outs GR32:$dst), (ins GR32:$src),
+ "rol{l}\t{%cl, $dst|$dst, CL}",
+ [(set GR32:$dst, (rotl GR32:$src, CL))]>;
+}
+
+def ROL8ri : Ii8<0xC0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2),
+ "rol{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (rotl GR8:$src1, (i8 imm:$src2)))]>;
+def ROL16ri : Ii8<0xC1, MRM0r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
+ "rol{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (rotl GR16:$src1, (i8 imm:$src2)))]>,
+ OpSize;
+def ROL32ri : Ii8<0xC1, MRM0r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
+ "rol{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (rotl GR32:$src1, (i8 imm:$src2)))]>;
+
+// Rotate by 1
+def ROL8r1 : I<0xD0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "rol{b}\t$dst",
+ [(set GR8:$dst, (rotl GR8:$src1, (i8 1)))]>;
+def ROL16r1 : I<0xD1, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
+ "rol{w}\t$dst",
+ [(set GR16:$dst, (rotl GR16:$src1, (i8 1)))]>, OpSize;
+def ROL32r1 : I<0xD1, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
+ "rol{l}\t$dst",
+ [(set GR32:$dst, (rotl GR32:$src1, (i8 1)))]>;
+
+let isTwoAddress = 0 in {
+ let Uses = [CL] in {
+ def ROL8mCL : I<0xD2, MRM0m, (outs), (ins i8mem :$dst),
+ "rol{b}\t{%cl, $dst|$dst, CL}",
+ [(store (rotl (loadi8 addr:$dst), CL), addr:$dst)]>;
+ def ROL16mCL : I<0xD3, MRM0m, (outs), (ins i16mem:$dst),
+ "rol{w}\t{%cl, $dst|$dst, CL}",
+ [(store (rotl (loadi16 addr:$dst), CL), addr:$dst)]>, OpSize;
+ def ROL32mCL : I<0xD3, MRM0m, (outs), (ins i32mem:$dst),
+ "rol{l}\t{%cl, $dst|$dst, CL}",
+ [(store (rotl (loadi32 addr:$dst), CL), addr:$dst)]>;
+ }
+ def ROL8mi : Ii8<0xC0, MRM0m, (outs), (ins i8mem :$dst, i8imm:$src),
+ "rol{b}\t{$src, $dst|$dst, $src}",
+ [(store (rotl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+ def ROL16mi : Ii8<0xC1, MRM0m, (outs), (ins i16mem:$dst, i8imm:$src),
+ "rol{w}\t{$src, $dst|$dst, $src}",
+ [(store (rotl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ OpSize;
+ def ROL32mi : Ii8<0xC1, MRM0m, (outs), (ins i32mem:$dst, i8imm:$src),
+ "rol{l}\t{$src, $dst|$dst, $src}",
+ [(store (rotl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+
+ // Rotate by 1
+ def ROL8m1 : I<0xD0, MRM0m, (outs), (ins i8mem :$dst),
+ "rol{b}\t$dst",
+ [(store (rotl (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
+ def ROL16m1 : I<0xD1, MRM0m, (outs), (ins i16mem:$dst),
+ "rol{w}\t$dst",
+ [(store (rotl (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,
+ OpSize;
+ def ROL32m1 : I<0xD1, MRM0m, (outs), (ins i32mem:$dst),
+ "rol{l}\t$dst",
+ [(store (rotl (loadi32 addr:$dst), (i8 1)), addr:$dst)]>;
+}
+
+let Uses = [CL] in {
+def ROR8rCL : I<0xD2, MRM1r, (outs GR8 :$dst), (ins GR8 :$src),
+ "ror{b}\t{%cl, $dst|$dst, CL}",
+ [(set GR8:$dst, (rotr GR8:$src, CL))]>;
+def ROR16rCL : I<0xD3, MRM1r, (outs GR16:$dst), (ins GR16:$src),
+ "ror{w}\t{%cl, $dst|$dst, CL}",
+ [(set GR16:$dst, (rotr GR16:$src, CL))]>, OpSize;
+def ROR32rCL : I<0xD3, MRM1r, (outs GR32:$dst), (ins GR32:$src),
+ "ror{l}\t{%cl, $dst|$dst, CL}",
+ [(set GR32:$dst, (rotr GR32:$src, CL))]>;
+}
+
+def ROR8ri : Ii8<0xC0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2),
+ "ror{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (rotr GR8:$src1, (i8 imm:$src2)))]>;
+def ROR16ri : Ii8<0xC1, MRM1r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
+ "ror{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (rotr GR16:$src1, (i8 imm:$src2)))]>,
+ OpSize;
+def ROR32ri : Ii8<0xC1, MRM1r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
+ "ror{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (rotr GR32:$src1, (i8 imm:$src2)))]>;
+
+// Rotate by 1
+def ROR8r1 : I<0xD0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
+ "ror{b}\t$dst",
+ [(set GR8:$dst, (rotr GR8:$src1, (i8 1)))]>;
+def ROR16r1 : I<0xD1, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
+ "ror{w}\t$dst",
+ [(set GR16:$dst, (rotr GR16:$src1, (i8 1)))]>, OpSize;
+def ROR32r1 : I<0xD1, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
+ "ror{l}\t$dst",
+ [(set GR32:$dst, (rotr GR32:$src1, (i8 1)))]>;
+
+let isTwoAddress = 0 in {
+ let Uses = [CL] in {
+ def ROR8mCL : I<0xD2, MRM1m, (outs), (ins i8mem :$dst),
+ "ror{b}\t{%cl, $dst|$dst, CL}",
+ [(store (rotr (loadi8 addr:$dst), CL), addr:$dst)]>;
+ def ROR16mCL : I<0xD3, MRM1m, (outs), (ins i16mem:$dst),
+ "ror{w}\t{%cl, $dst|$dst, CL}",
+ [(store (rotr (loadi16 addr:$dst), CL), addr:$dst)]>, OpSize;
+ def ROR32mCL : I<0xD3, MRM1m, (outs), (ins i32mem:$dst),
+ "ror{l}\t{%cl, $dst|$dst, CL}",
+ [(store (rotr (loadi32 addr:$dst), CL), addr:$dst)]>;
+ }
+ def ROR8mi : Ii8<0xC0, MRM1m, (outs), (ins i8mem :$dst, i8imm:$src),
+ "ror{b}\t{$src, $dst|$dst, $src}",
+ [(store (rotr (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+ def ROR16mi : Ii8<0xC1, MRM1m, (outs), (ins i16mem:$dst, i8imm:$src),
+ "ror{w}\t{$src, $dst|$dst, $src}",
+ [(store (rotr (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ OpSize;
+ def ROR32mi : Ii8<0xC1, MRM1m, (outs), (ins i32mem:$dst, i8imm:$src),
+ "ror{l}\t{$src, $dst|$dst, $src}",
+ [(store (rotr (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+
+ // Rotate by 1
+ def ROR8m1 : I<0xD0, MRM1m, (outs), (ins i8mem :$dst),
+ "ror{b}\t$dst",
+ [(store (rotr (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
+ def ROR16m1 : I<0xD1, MRM1m, (outs), (ins i16mem:$dst),
+ "ror{w}\t$dst",
+ [(store (rotr (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,
+ OpSize;
+ def ROR32m1 : I<0xD1, MRM1m, (outs), (ins i32mem:$dst),
+ "ror{l}\t$dst",
+ [(store (rotr (loadi32 addr:$dst), (i8 1)), addr:$dst)]>;
+}
+
+
+
+// Double shift instructions (generalizations of rotate)
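+// These shift the destination by the count while shifting in bits from the
+// second register operand.  For illustration (AT&T syntax, registers chosen
+// arbitrarily), a 64-bit left shift of EDX:EAX by CL (0 < CL < 32) can be
+// built from them:
+//   shldl %cl, %eax, %edx   # high half: vacated bits filled from EAX
+//   shll  %cl, %eax         # low half shifted normally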
+let Uses = [CL] in {
+def SHLD32rrCL : I<0xA5, MRMDestReg, (outs GR32:$dst),
+ (ins GR32:$src1, GR32:$src2),
+ "shld{l}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+ [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, CL))]>, TB;
+def SHRD32rrCL : I<0xAD, MRMDestReg, (outs GR32:$dst),
+ (ins GR32:$src1, GR32:$src2),
+ "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+ [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, CL))]>, TB;
+def SHLD16rrCL : I<0xA5, MRMDestReg, (outs GR16:$dst),
+ (ins GR16:$src1, GR16:$src2),
+ "shld{w}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+ [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, CL))]>,
+ TB, OpSize;
+def SHRD16rrCL : I<0xAD, MRMDestReg, (outs GR16:$dst),
+ (ins GR16:$src1, GR16:$src2),
+ "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+ [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, CL))]>,
+ TB, OpSize;
+}
+
+let isCommutable = 1 in { // These instructions commute to each other.
+def SHLD32rri8 : Ii8<0xA4, MRMDestReg,
+ (outs GR32:$dst),
+ (ins GR32:$src1, GR32:$src2, i8imm:$src3),
+ "shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2,
+ (i8 imm:$src3)))]>,
+ TB;
+def SHRD32rri8 : Ii8<0xAC, MRMDestReg,
+ (outs GR32:$dst),
+ (ins GR32:$src1, GR32:$src2, i8imm:$src3),
+ "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2,
+ (i8 imm:$src3)))]>,
+ TB;
+def SHLD16rri8 : Ii8<0xA4, MRMDestReg,
+ (outs GR16:$dst),
+ (ins GR16:$src1, GR16:$src2, i8imm:$src3),
+ "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2,
+ (i8 imm:$src3)))]>,
+ TB, OpSize;
+def SHRD16rri8 : Ii8<0xAC, MRMDestReg,
+ (outs GR16:$dst),
+ (ins GR16:$src1, GR16:$src2, i8imm:$src3),
+ "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2,
+ (i8 imm:$src3)))]>,
+ TB, OpSize;
+}
+
+let isTwoAddress = 0 in {
+ let Uses = [CL] in {
+ def SHLD32mrCL : I<0xA5, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+ "shld{l}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+ [(store (X86shld (loadi32 addr:$dst), GR32:$src2, CL),
+ addr:$dst)]>, TB;
+ def SHRD32mrCL : I<0xAD, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+ "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+ [(store (X86shrd (loadi32 addr:$dst), GR32:$src2, CL),
+ addr:$dst)]>, TB;
+ }
+ def SHLD32mri8 : Ii8<0xA4, MRMDestMem,
+ (outs), (ins i32mem:$dst, GR32:$src2, i8imm:$src3),
+ "shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(store (X86shld (loadi32 addr:$dst), GR32:$src2,
+ (i8 imm:$src3)), addr:$dst)]>,
+ TB;
+ def SHRD32mri8 : Ii8<0xAC, MRMDestMem,
+ (outs), (ins i32mem:$dst, GR32:$src2, i8imm:$src3),
+ "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(store (X86shrd (loadi32 addr:$dst), GR32:$src2,
+ (i8 imm:$src3)), addr:$dst)]>,
+ TB;
+
+ let Uses = [CL] in {
+ def SHLD16mrCL : I<0xA5, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+ "shld{w}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+ [(store (X86shld (loadi16 addr:$dst), GR16:$src2, CL),
+ addr:$dst)]>, TB, OpSize;
+ def SHRD16mrCL : I<0xAD, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+ "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+ [(store (X86shrd (loadi16 addr:$dst), GR16:$src2, CL),
+ addr:$dst)]>, TB, OpSize;
+ }
+ def SHLD16mri8 : Ii8<0xA4, MRMDestMem,
+ (outs), (ins i16mem:$dst, GR16:$src2, i8imm:$src3),
+ "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(store (X86shld (loadi16 addr:$dst), GR16:$src2,
+ (i8 imm:$src3)), addr:$dst)]>,
+ TB, OpSize;
+ def SHRD16mri8 : Ii8<0xAC, MRMDestMem,
+ (outs), (ins i16mem:$dst, GR16:$src2, i8imm:$src3),
+ "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(store (X86shrd (loadi16 addr:$dst), GR16:$src2,
+ (i8 imm:$src3)), addr:$dst)]>,
+ TB, OpSize;
+}
+} // Defs = [EFLAGS]
+
+
+// Arithmetic.
+let Defs = [EFLAGS] in {
+let isCommutable = 1 in { // X = ADD Y, Z --> X = ADD Z, Y
+// Register-Register Addition
+def ADD8rr : I<0x00, MRMDestReg, (outs GR8 :$dst),
+ (ins GR8 :$src1, GR8 :$src2),
+ "add{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, EFLAGS, (X86add_flag GR8:$src1, GR8:$src2))]>;
+
+let isConvertibleToThreeAddress = 1 in { // Can transform into LEA.
+// Register-Register Addition
+def ADD16rr : I<0x01, MRMDestReg, (outs GR16:$dst),
+ (ins GR16:$src1, GR16:$src2),
+ "add{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, EFLAGS, (X86add_flag GR16:$src1,
+ GR16:$src2))]>, OpSize;
+def ADD32rr : I<0x01, MRMDestReg, (outs GR32:$dst),
+ (ins GR32:$src1, GR32:$src2),
+ "add{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, EFLAGS, (X86add_flag GR32:$src1,
+ GR32:$src2))]>;
+} // end isConvertibleToThreeAddress
+} // end isCommutable
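+// For illustration (register choices arbitrary): the conversion gives the
+// register allocator a three-address form such as
+//   leal (%eax,%ebx), %ecx   # ECX = EAX + EBX, neither source clobbered
+// which is legal as long as the EFLAGS result of the add is not needed.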
+
+// These are alternate spellings for use by the disassembler; we mark them as
+// code-gen-only so that the assembler never matches them (an encoding example
+// follows the definitions below).
+let isCodeGenOnly = 1 in {
+ def ADD8rr_alt: I<0x02, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src1, GR8:$src2),
+ "add{b}\t{$src2, $dst|$dst, $src2}", []>;
+ def ADD16rr_alt: I<0x03, MRMSrcReg,(outs GR16:$dst),(ins GR16:$src1, GR16:$src2),
+ "add{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize;
+ def ADD32rr_alt: I<0x03, MRMSrcReg,(outs GR32:$dst),(ins GR32:$src1, GR32:$src2),
+ "add{l}\t{$src2, $dst|$dst, $src2}", []>;
+}
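+// For example, "addb %bl, %al" has two byte-level spellings: 00 D8 (the
+// MRMDestReg form above) and 02 C3 (the _alt MRMSrcReg form).  The assembler
+// only matches the former; the disassembler must accept both.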
+
+// Register-Memory Addition
+def ADD8rm : I<0x02, MRMSrcMem, (outs GR8 :$dst),
+ (ins GR8 :$src1, i8mem :$src2),
+ "add{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, EFLAGS, (X86add_flag GR8:$src1,
+ (load addr:$src2)))]>;
+def ADD16rm : I<0x03, MRMSrcMem, (outs GR16:$dst),
+ (ins GR16:$src1, i16mem:$src2),
+ "add{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, EFLAGS, (X86add_flag GR16:$src1,
+ (load addr:$src2)))]>, OpSize;
+def ADD32rm : I<0x03, MRMSrcMem, (outs GR32:$dst),
+ (ins GR32:$src1, i32mem:$src2),
+ "add{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, EFLAGS, (X86add_flag GR32:$src1,
+ (load addr:$src2)))]>;
+
+// Register-Integer Addition
+def ADD8ri : Ii8<0x80, MRM0r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
+ "add{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, EFLAGS,
+ (X86add_flag GR8:$src1, imm:$src2))]>;
+
+let isConvertibleToThreeAddress = 1 in { // Can transform into LEA.
+// Register-Integer Addition
+def ADD16ri : Ii16<0x81, MRM0r, (outs GR16:$dst),
+ (ins GR16:$src1, i16imm:$src2),
+ "add{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, EFLAGS,
+ (X86add_flag GR16:$src1, imm:$src2))]>, OpSize;
+def ADD32ri : Ii32<0x81, MRM0r, (outs GR32:$dst),
+ (ins GR32:$src1, i32imm:$src2),
+ "add{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, EFLAGS,
+ (X86add_flag GR32:$src1, imm:$src2))]>;
+def ADD16ri8 : Ii8<0x83, MRM0r, (outs GR16:$dst),
+ (ins GR16:$src1, i16i8imm:$src2),
+ "add{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, EFLAGS,
+ (X86add_flag GR16:$src1, i16immSExt8:$src2))]>, OpSize;
+def ADD32ri8 : Ii8<0x83, MRM0r, (outs GR32:$dst),
+ (ins GR32:$src1, i32i8imm:$src2),
+ "add{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, EFLAGS,
+ (X86add_flag GR32:$src1, i32immSExt8:$src2))]>;
+}
+
+let isTwoAddress = 0 in {
+ // Memory-Register Addition
+ def ADD8mr : I<0x00, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2),
+ "add{b}\t{$src2, $dst|$dst, $src2}",
+ [(store (add (load addr:$dst), GR8:$src2), addr:$dst),
+ (implicit EFLAGS)]>;
+ def ADD16mr : I<0x01, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+ "add{w}\t{$src2, $dst|$dst, $src2}",
+ [(store (add (load addr:$dst), GR16:$src2), addr:$dst),
+ (implicit EFLAGS)]>, OpSize;
+ def ADD32mr : I<0x01, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+ "add{l}\t{$src2, $dst|$dst, $src2}",
+ [(store (add (load addr:$dst), GR32:$src2), addr:$dst),
+ (implicit EFLAGS)]>;
+ def ADD8mi : Ii8<0x80, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src2),
+ "add{b}\t{$src2, $dst|$dst, $src2}",
+ [(store (add (loadi8 addr:$dst), imm:$src2), addr:$dst),
+ (implicit EFLAGS)]>;
+ def ADD16mi : Ii16<0x81, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src2),
+ "add{w}\t{$src2, $dst|$dst, $src2}",
+ [(store (add (loadi16 addr:$dst), imm:$src2), addr:$dst),
+ (implicit EFLAGS)]>, OpSize;
+ def ADD32mi : Ii32<0x81, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src2),
+ "add{l}\t{$src2, $dst|$dst, $src2}",
+ [(store (add (loadi32 addr:$dst), imm:$src2), addr:$dst),
+ (implicit EFLAGS)]>;
+ def ADD16mi8 : Ii8<0x83, MRM0m, (outs), (ins i16mem:$dst, i16i8imm :$src2),
+ "add{w}\t{$src2, $dst|$dst, $src2}",
+ [(store (add (load addr:$dst), i16immSExt8:$src2),
+ addr:$dst),
+ (implicit EFLAGS)]>, OpSize;
+ def ADD32mi8 : Ii8<0x83, MRM0m, (outs), (ins i32mem:$dst, i32i8imm :$src2),
+ "add{l}\t{$src2, $dst|$dst, $src2}",
+ [(store (add (load addr:$dst), i32immSExt8:$src2),
+ addr:$dst),
+ (implicit EFLAGS)]>;
+
+  // Additions to AL, AX, and EAX (the short-form accumulator encodings).
+ def ADD8i8 : Ii8<0x04, RawFrm, (outs), (ins i8imm:$src),
+ "add{b}\t{$src, %al|%al, $src}", []>;
+ def ADD16i16 : Ii16<0x05, RawFrm, (outs), (ins i16imm:$src),
+ "add{w}\t{$src, %ax|%ax, $src}", []>, OpSize;
+ def ADD32i32 : Ii32<0x05, RawFrm, (outs), (ins i32imm:$src),
+ "add{l}\t{$src, %eax|%eax, $src}", []>;
+}
+
+let Uses = [EFLAGS] in {
+let isCommutable = 1 in { // X = ADC Y, Z --> X = ADC Z, Y
+def ADC8rr : I<0x10, MRMDestReg, (outs GR8:$dst), (ins GR8:$src1, GR8:$src2),
+ "adc{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (adde GR8:$src1, GR8:$src2))]>;
+def ADC16rr : I<0x11, MRMDestReg, (outs GR16:$dst),
+ (ins GR16:$src1, GR16:$src2),
+ "adc{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (adde GR16:$src1, GR16:$src2))]>, OpSize;
+def ADC32rr : I<0x11, MRMDestReg, (outs GR32:$dst),
+ (ins GR32:$src1, GR32:$src2),
+ "adc{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (adde GR32:$src1, GR32:$src2))]>;
+}
+
+let isCodeGenOnly = 1 in {
+def ADC8rr_REV : I<0x12, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src1, GR8:$src2),
+ "adc{b}\t{$src2, $dst|$dst, $src2}", []>;
+def ADC16rr_REV : I<0x13, MRMSrcReg, (outs GR16:$dst),
+ (ins GR16:$src1, GR16:$src2),
+ "adc{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize;
+def ADC32rr_REV : I<0x13, MRMSrcReg, (outs GR32:$dst),
+ (ins GR32:$src1, GR32:$src2),
+ "adc{l}\t{$src2, $dst|$dst, $src2}", []>;
+}
+
+def ADC8rm : I<0x12, MRMSrcMem , (outs GR8:$dst),
+ (ins GR8:$src1, i8mem:$src2),
+ "adc{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (adde GR8:$src1, (load addr:$src2)))]>;
+def ADC16rm : I<0x13, MRMSrcMem , (outs GR16:$dst),
+ (ins GR16:$src1, i16mem:$src2),
+ "adc{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (adde GR16:$src1, (load addr:$src2)))]>,
+ OpSize;
+def ADC32rm : I<0x13, MRMSrcMem , (outs GR32:$dst),
+ (ins GR32:$src1, i32mem:$src2),
+ "adc{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (adde GR32:$src1, (load addr:$src2)))]>;
+def ADC8ri : Ii8<0x80, MRM2r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
+ "adc{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (adde GR8:$src1, imm:$src2))]>;
+def ADC16ri : Ii16<0x81, MRM2r, (outs GR16:$dst),
+ (ins GR16:$src1, i16imm:$src2),
+ "adc{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (adde GR16:$src1, imm:$src2))]>, OpSize;
+def ADC16ri8 : Ii8<0x83, MRM2r, (outs GR16:$dst),
+ (ins GR16:$src1, i16i8imm:$src2),
+ "adc{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (adde GR16:$src1, i16immSExt8:$src2))]>,
+ OpSize;
+def ADC32ri : Ii32<0x81, MRM2r, (outs GR32:$dst),
+ (ins GR32:$src1, i32imm:$src2),
+ "adc{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (adde GR32:$src1, imm:$src2))]>;
+def ADC32ri8 : Ii8<0x83, MRM2r, (outs GR32:$dst),
+ (ins GR32:$src1, i32i8imm:$src2),
+ "adc{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (adde GR32:$src1, i32immSExt8:$src2))]>;
+
+let isTwoAddress = 0 in {
+ def ADC8mr : I<0x10, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2),
+ "adc{b}\t{$src2, $dst|$dst, $src2}",
+ [(store (adde (load addr:$dst), GR8:$src2), addr:$dst)]>;
+ def ADC16mr : I<0x11, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+ "adc{w}\t{$src2, $dst|$dst, $src2}",
+ [(store (adde (load addr:$dst), GR16:$src2), addr:$dst)]>,
+ OpSize;
+ def ADC32mr : I<0x11, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+ "adc{l}\t{$src2, $dst|$dst, $src2}",
+ [(store (adde (load addr:$dst), GR32:$src2), addr:$dst)]>;
+ def ADC8mi : Ii8<0x80, MRM2m, (outs), (ins i8mem:$dst, i8imm:$src2),
+ "adc{b}\t{$src2, $dst|$dst, $src2}",
+ [(store (adde (loadi8 addr:$dst), imm:$src2), addr:$dst)]>;
+ def ADC16mi : Ii16<0x81, MRM2m, (outs), (ins i16mem:$dst, i16imm:$src2),
+ "adc{w}\t{$src2, $dst|$dst, $src2}",
+ [(store (adde (loadi16 addr:$dst), imm:$src2), addr:$dst)]>,
+ OpSize;
+ def ADC16mi8 : Ii8<0x83, MRM2m, (outs), (ins i16mem:$dst, i16i8imm :$src2),
+ "adc{w}\t{$src2, $dst|$dst, $src2}",
+ [(store (adde (load addr:$dst), i16immSExt8:$src2), addr:$dst)]>,
+ OpSize;
+ def ADC32mi : Ii32<0x81, MRM2m, (outs), (ins i32mem:$dst, i32imm:$src2),
+ "adc{l}\t{$src2, $dst|$dst, $src2}",
+ [(store (adde (loadi32 addr:$dst), imm:$src2), addr:$dst)]>;
+ def ADC32mi8 : Ii8<0x83, MRM2m, (outs), (ins i32mem:$dst, i32i8imm :$src2),
+ "adc{l}\t{$src2, $dst|$dst, $src2}",
+ [(store (adde (load addr:$dst), i32immSExt8:$src2), addr:$dst)]>;
+
+ def ADC8i8 : Ii8<0x14, RawFrm, (outs), (ins i8imm:$src),
+ "adc{b}\t{$src, %al|%al, $src}", []>;
+ def ADC16i16 : Ii16<0x15, RawFrm, (outs), (ins i16imm:$src),
+ "adc{w}\t{$src, %ax|%ax, $src}", []>, OpSize;
+ def ADC32i32 : Ii32<0x15, RawFrm, (outs), (ins i32imm:$src),
+ "adc{l}\t{$src, %eax|%eax, $src}", []>;
+}
+} // Uses = [EFLAGS]
+
+// Register-Register Subtraction
+def SUB8rr : I<0x28, MRMDestReg, (outs GR8:$dst), (ins GR8:$src1, GR8:$src2),
+ "sub{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, EFLAGS,
+ (X86sub_flag GR8:$src1, GR8:$src2))]>;
+def SUB16rr : I<0x29, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1,GR16:$src2),
+ "sub{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, EFLAGS,
+ (X86sub_flag GR16:$src1, GR16:$src2))]>, OpSize;
+def SUB32rr : I<0x29, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1,GR32:$src2),
+ "sub{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, EFLAGS,
+ (X86sub_flag GR32:$src1, GR32:$src2))]>;
+
+let isCodeGenOnly = 1 in {
+def SUB8rr_REV : I<0x2A, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src1, GR8:$src2),
+ "sub{b}\t{$src2, $dst|$dst, $src2}", []>;
+def SUB16rr_REV : I<0x2B, MRMSrcReg, (outs GR16:$dst),
+ (ins GR16:$src1, GR16:$src2),
+ "sub{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize;
+def SUB32rr_REV : I<0x2B, MRMSrcReg, (outs GR32:$dst),
+ (ins GR32:$src1, GR32:$src2),
+ "sub{l}\t{$src2, $dst|$dst, $src2}", []>;
+}
+
+// Register-Memory Subtraction
+def SUB8rm : I<0x2A, MRMSrcMem, (outs GR8 :$dst),
+ (ins GR8 :$src1, i8mem :$src2),
+ "sub{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, EFLAGS,
+ (X86sub_flag GR8:$src1, (load addr:$src2)))]>;
+def SUB16rm : I<0x2B, MRMSrcMem, (outs GR16:$dst),
+ (ins GR16:$src1, i16mem:$src2),
+ "sub{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, EFLAGS,
+ (X86sub_flag GR16:$src1, (load addr:$src2)))]>, OpSize;
+def SUB32rm : I<0x2B, MRMSrcMem, (outs GR32:$dst),
+ (ins GR32:$src1, i32mem:$src2),
+ "sub{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, EFLAGS,
+ (X86sub_flag GR32:$src1, (load addr:$src2)))]>;
+
+// Register-Integer Subtraction
+def SUB8ri : Ii8 <0x80, MRM5r, (outs GR8:$dst),
+ (ins GR8:$src1, i8imm:$src2),
+ "sub{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, EFLAGS,
+ (X86sub_flag GR8:$src1, imm:$src2))]>;
+def SUB16ri : Ii16<0x81, MRM5r, (outs GR16:$dst),
+ (ins GR16:$src1, i16imm:$src2),
+ "sub{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, EFLAGS,
+ (X86sub_flag GR16:$src1, imm:$src2))]>, OpSize;
+def SUB32ri : Ii32<0x81, MRM5r, (outs GR32:$dst),
+ (ins GR32:$src1, i32imm:$src2),
+ "sub{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, EFLAGS,
+ (X86sub_flag GR32:$src1, imm:$src2))]>;
+def SUB16ri8 : Ii8<0x83, MRM5r, (outs GR16:$dst),
+ (ins GR16:$src1, i16i8imm:$src2),
+ "sub{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, EFLAGS,
+ (X86sub_flag GR16:$src1, i16immSExt8:$src2))]>, OpSize;
+def SUB32ri8 : Ii8<0x83, MRM5r, (outs GR32:$dst),
+ (ins GR32:$src1, i32i8imm:$src2),
+ "sub{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, EFLAGS,
+ (X86sub_flag GR32:$src1, i32immSExt8:$src2))]>;
+
+let isTwoAddress = 0 in {
+ // Memory-Register Subtraction
+ def SUB8mr : I<0x28, MRMDestMem, (outs), (ins i8mem :$dst, GR8 :$src2),
+ "sub{b}\t{$src2, $dst|$dst, $src2}",
+ [(store (sub (load addr:$dst), GR8:$src2), addr:$dst),
+ (implicit EFLAGS)]>;
+ def SUB16mr : I<0x29, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+ "sub{w}\t{$src2, $dst|$dst, $src2}",
+ [(store (sub (load addr:$dst), GR16:$src2), addr:$dst),
+ (implicit EFLAGS)]>, OpSize;
+ def SUB32mr : I<0x29, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+ "sub{l}\t{$src2, $dst|$dst, $src2}",
+ [(store (sub (load addr:$dst), GR32:$src2), addr:$dst),
+ (implicit EFLAGS)]>;
+
+ // Memory-Integer Subtraction
+ def SUB8mi : Ii8<0x80, MRM5m, (outs), (ins i8mem :$dst, i8imm:$src2),
+ "sub{b}\t{$src2, $dst|$dst, $src2}",
+ [(store (sub (loadi8 addr:$dst), imm:$src2), addr:$dst),
+ (implicit EFLAGS)]>;
+ def SUB16mi : Ii16<0x81, MRM5m, (outs), (ins i16mem:$dst, i16imm:$src2),
+ "sub{w}\t{$src2, $dst|$dst, $src2}",
+ [(store (sub (loadi16 addr:$dst), imm:$src2),addr:$dst),
+ (implicit EFLAGS)]>, OpSize;
+ def SUB32mi : Ii32<0x81, MRM5m, (outs), (ins i32mem:$dst, i32imm:$src2),
+ "sub{l}\t{$src2, $dst|$dst, $src2}",
+ [(store (sub (loadi32 addr:$dst), imm:$src2),addr:$dst),
+ (implicit EFLAGS)]>;
+ def SUB16mi8 : Ii8<0x83, MRM5m, (outs), (ins i16mem:$dst, i16i8imm :$src2),
+ "sub{w}\t{$src2, $dst|$dst, $src2}",
+ [(store (sub (load addr:$dst), i16immSExt8:$src2),
+ addr:$dst),
+ (implicit EFLAGS)]>, OpSize;
+ def SUB32mi8 : Ii8<0x83, MRM5m, (outs), (ins i32mem:$dst, i32i8imm :$src2),
+ "sub{l}\t{$src2, $dst|$dst, $src2}",
+ [(store (sub (load addr:$dst), i32immSExt8:$src2),
+ addr:$dst),
+ (implicit EFLAGS)]>;
+
+ def SUB8i8 : Ii8<0x2C, RawFrm, (outs), (ins i8imm:$src),
+ "sub{b}\t{$src, %al|%al, $src}", []>;
+ def SUB16i16 : Ii16<0x2D, RawFrm, (outs), (ins i16imm:$src),
+ "sub{w}\t{$src, %ax|%ax, $src}", []>, OpSize;
+ def SUB32i32 : Ii32<0x2D, RawFrm, (outs), (ins i32imm:$src),
+ "sub{l}\t{$src, %eax|%eax, $src}", []>;
+}
+
+let Uses = [EFLAGS] in {
+def SBB8rr : I<0x18, MRMDestReg, (outs GR8:$dst),
+ (ins GR8:$src1, GR8:$src2),
+ "sbb{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (sube GR8:$src1, GR8:$src2))]>;
+def SBB16rr : I<0x19, MRMDestReg, (outs GR16:$dst),
+ (ins GR16:$src1, GR16:$src2),
+ "sbb{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (sube GR16:$src1, GR16:$src2))]>, OpSize;
+def SBB32rr : I<0x19, MRMDestReg, (outs GR32:$dst),
+ (ins GR32:$src1, GR32:$src2),
+ "sbb{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (sube GR32:$src1, GR32:$src2))]>;
+
+let isTwoAddress = 0 in {
+ def SBB8mr : I<0x18, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2),
+ "sbb{b}\t{$src2, $dst|$dst, $src2}",
+ [(store (sube (load addr:$dst), GR8:$src2), addr:$dst)]>;
+ def SBB16mr : I<0x19, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+ "sbb{w}\t{$src2, $dst|$dst, $src2}",
+ [(store (sube (load addr:$dst), GR16:$src2), addr:$dst)]>,
+ OpSize;
+ def SBB32mr : I<0x19, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+ "sbb{l}\t{$src2, $dst|$dst, $src2}",
+ [(store (sube (load addr:$dst), GR32:$src2), addr:$dst)]>;
+ def SBB8mi : Ii8<0x80, MRM3m, (outs), (ins i8mem:$dst, i8imm:$src2),
+ "sbb{b}\t{$src2, $dst|$dst, $src2}",
+ [(store (sube (loadi8 addr:$dst), imm:$src2), addr:$dst)]>;
+ def SBB16mi : Ii16<0x81, MRM3m, (outs), (ins i16mem:$dst, i16imm:$src2),
+ "sbb{w}\t{$src2, $dst|$dst, $src2}",
+ [(store (sube (loadi16 addr:$dst), imm:$src2), addr:$dst)]>,
+ OpSize;
+ def SBB16mi8 : Ii8<0x83, MRM3m, (outs), (ins i16mem:$dst, i16i8imm :$src2),
+ "sbb{w}\t{$src2, $dst|$dst, $src2}",
+ [(store (sube (load addr:$dst), i16immSExt8:$src2), addr:$dst)]>,
+ OpSize;
+ def SBB32mi : Ii32<0x81, MRM3m, (outs), (ins i32mem:$dst, i32imm:$src2),
+ "sbb{l}\t{$src2, $dst|$dst, $src2}",
+ [(store (sube (loadi32 addr:$dst), imm:$src2), addr:$dst)]>;
+ def SBB32mi8 : Ii8<0x83, MRM3m, (outs), (ins i32mem:$dst, i32i8imm :$src2),
+ "sbb{l}\t{$src2, $dst|$dst, $src2}",
+ [(store (sube (load addr:$dst), i32immSExt8:$src2), addr:$dst)]>;
+
+ def SBB8i8 : Ii8<0x1C, RawFrm, (outs), (ins i8imm:$src),
+ "sbb{b}\t{$src, %al|%al, $src}", []>;
+ def SBB16i16 : Ii16<0x1D, RawFrm, (outs), (ins i16imm:$src),
+ "sbb{w}\t{$src, %ax|%ax, $src}", []>, OpSize;
+ def SBB32i32 : Ii32<0x1D, RawFrm, (outs), (ins i32imm:$src),
+ "sbb{l}\t{$src, %eax|%eax, $src}", []>;
+}
+
+let isCodeGenOnly = 1 in {
+def SBB8rr_REV : I<0x1A, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src1, GR8:$src2),
+ "sbb{b}\t{$src2, $dst|$dst, $src2}", []>;
+def SBB16rr_REV : I<0x1B, MRMSrcReg, (outs GR16:$dst),
+ (ins GR16:$src1, GR16:$src2),
+ "sbb{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize;
+def SBB32rr_REV : I<0x1B, MRMSrcReg, (outs GR32:$dst),
+ (ins GR32:$src1, GR32:$src2),
+ "sbb{l}\t{$src2, $dst|$dst, $src2}", []>;
+}
+
+def SBB8rm : I<0x1A, MRMSrcMem, (outs GR8:$dst), (ins GR8:$src1, i8mem:$src2),
+ "sbb{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (sube GR8:$src1, (load addr:$src2)))]>;
+def SBB16rm : I<0x1B, MRMSrcMem, (outs GR16:$dst),
+ (ins GR16:$src1, i16mem:$src2),
+ "sbb{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (sube GR16:$src1, (load addr:$src2)))]>,
+ OpSize;
+def SBB32rm : I<0x1B, MRMSrcMem, (outs GR32:$dst),
+ (ins GR32:$src1, i32mem:$src2),
+ "sbb{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (sube GR32:$src1, (load addr:$src2)))]>;
+def SBB8ri : Ii8<0x80, MRM3r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
+ "sbb{b}\t{$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (sube GR8:$src1, imm:$src2))]>;
+def SBB16ri : Ii16<0x81, MRM3r, (outs GR16:$dst),
+ (ins GR16:$src1, i16imm:$src2),
+ "sbb{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (sube GR16:$src1, imm:$src2))]>, OpSize;
+def SBB16ri8 : Ii8<0x83, MRM3r, (outs GR16:$dst),
+ (ins GR16:$src1, i16i8imm:$src2),
+ "sbb{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (sube GR16:$src1, i16immSExt8:$src2))]>,
+ OpSize;
+def SBB32ri : Ii32<0x81, MRM3r, (outs GR32:$dst),
+ (ins GR32:$src1, i32imm:$src2),
+ "sbb{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (sube GR32:$src1, imm:$src2))]>;
+def SBB32ri8 : Ii8<0x83, MRM3r, (outs GR32:$dst),
+ (ins GR32:$src1, i32i8imm:$src2),
+ "sbb{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (sube GR32:$src1, i32immSExt8:$src2))]>;
+} // Uses = [EFLAGS]
+} // Defs = [EFLAGS]
+
+let Defs = [EFLAGS] in {
+let isCommutable = 1 in { // X = IMUL Y, Z --> X = IMUL Z, Y
+// Register-Register Signed Integer Multiply
+def IMUL16rr : I<0xAF, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src1,GR16:$src2),
+ "imul{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, EFLAGS,
+ (X86smul_flag GR16:$src1, GR16:$src2))]>, TB, OpSize;
+def IMUL32rr : I<0xAF, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1,GR32:$src2),
+ "imul{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, EFLAGS,
+ (X86smul_flag GR32:$src1, GR32:$src2))]>, TB;
+}
+
+// Register-Memory Signed Integer Multiply
+def IMUL16rm : I<0xAF, MRMSrcMem, (outs GR16:$dst),
+ (ins GR16:$src1, i16mem:$src2),
+ "imul{w}\t{$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, EFLAGS,
+ (X86smul_flag GR16:$src1, (load addr:$src2)))]>,
+ TB, OpSize;
+def IMUL32rm : I<0xAF, MRMSrcMem, (outs GR32:$dst),
+ (ins GR32:$src1, i32mem:$src2),
+ "imul{l}\t{$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, EFLAGS,
+ (X86smul_flag GR32:$src1, (load addr:$src2)))]>, TB;
+} // Defs = [EFLAGS]
+} // end Two Address instructions
+
+// Surprisingly enough, these are not two-address instructions!
+let Defs = [EFLAGS] in {
+// Register-Integer Signed Integer Multiply
+def IMUL16rri : Ii16<0x69, MRMSrcReg, // GR16 = GR16*I16
+ (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+ "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR16:$dst, EFLAGS,
+ (X86smul_flag GR16:$src1, imm:$src2))]>, OpSize;
+def IMUL32rri : Ii32<0x69, MRMSrcReg, // GR32 = GR32*I32
+ (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
+ "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32:$dst, EFLAGS,
+ (X86smul_flag GR32:$src1, imm:$src2))]>;
+def IMUL16rri8 : Ii8<0x6B, MRMSrcReg, // GR16 = GR16*I8
+ (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
+ "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR16:$dst, EFLAGS,
+ (X86smul_flag GR16:$src1, i16immSExt8:$src2))]>,
+ OpSize;
+def IMUL32rri8 : Ii8<0x6B, MRMSrcReg, // GR32 = GR32*I8
+ (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
+ "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32:$dst, EFLAGS,
+ (X86smul_flag GR32:$src1, i32immSExt8:$src2))]>;
+
+// Memory-Integer Signed Integer Multiply
+def IMUL16rmi : Ii16<0x69, MRMSrcMem, // GR16 = [mem16]*I16
+ (outs GR16:$dst), (ins i16mem:$src1, i16imm:$src2),
+ "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR16:$dst, EFLAGS,
+ (X86smul_flag (load addr:$src1), imm:$src2))]>,
+ OpSize;
+def IMUL32rmi : Ii32<0x69, MRMSrcMem, // GR32 = [mem32]*I32
+ (outs GR32:$dst), (ins i32mem:$src1, i32imm:$src2),
+ "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32:$dst, EFLAGS,
+ (X86smul_flag (load addr:$src1), imm:$src2))]>;
+def IMUL16rmi8 : Ii8<0x6B, MRMSrcMem, // GR16 = [mem16]*I8
+ (outs GR16:$dst), (ins i16mem:$src1, i16i8imm :$src2),
+ "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR16:$dst, EFLAGS,
+ (X86smul_flag (load addr:$src1),
+ i16immSExt8:$src2))]>, OpSize;
+def IMUL32rmi8 : Ii8<0x6B, MRMSrcMem, // GR32 = [mem32]*I8
+ (outs GR32:$dst), (ins i32mem:$src1, i32i8imm: $src2),
+ "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32:$dst, EFLAGS,
+ (X86smul_flag (load addr:$src1),
+ i32immSExt8:$src2))]>;
+} // Defs = [EFLAGS]
+
+//===----------------------------------------------------------------------===//
+// Test instructions are just like AND, except they don't generate a result.
+//
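+// A typical use (illustrative, AT&T syntax) is comparing a register against
+// zero without an immediate:
+//   testl %eax, %eax   # EFLAGS = flags of (EAX & EAX); EAX is unchanged
+//   je    .Lzero       # taken when EAX == 0   (.Lzero is a placeholder)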
+let Defs = [EFLAGS] in {
+let isCommutable = 1 in { // TEST X, Y --> TEST Y, X
+def TEST8rr : I<0x84, MRMSrcReg, (outs), (ins GR8:$src1, GR8:$src2),
+ "test{b}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86cmp (and_su GR8:$src1, GR8:$src2), 0))]>;
+def TEST16rr : I<0x85, MRMSrcReg, (outs), (ins GR16:$src1, GR16:$src2),
+ "test{w}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86cmp (and_su GR16:$src1, GR16:$src2),
+ 0))]>,
+ OpSize;
+def TEST32rr : I<0x85, MRMSrcReg, (outs), (ins GR32:$src1, GR32:$src2),
+ "test{l}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86cmp (and_su GR32:$src1, GR32:$src2),
+ 0))]>;
+}
+
+def TEST8i8 : Ii8<0xA8, RawFrm, (outs), (ins i8imm:$src),
+ "test{b}\t{$src, %al|%al, $src}", []>;
+def TEST16i16 : Ii16<0xA9, RawFrm, (outs), (ins i16imm:$src),
+ "test{w}\t{$src, %ax|%ax, $src}", []>, OpSize;
+def TEST32i32 : Ii32<0xA9, RawFrm, (outs), (ins i32imm:$src),
+ "test{l}\t{$src, %eax|%eax, $src}", []>;
+
+def TEST8rm : I<0x84, MRMSrcMem, (outs), (ins GR8 :$src1, i8mem :$src2),
+ "test{b}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86cmp (and GR8:$src1, (loadi8 addr:$src2)),
+ 0))]>;
+def TEST16rm : I<0x85, MRMSrcMem, (outs), (ins GR16:$src1, i16mem:$src2),
+ "test{w}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86cmp (and GR16:$src1,
+ (loadi16 addr:$src2)), 0))]>, OpSize;
+def TEST32rm : I<0x85, MRMSrcMem, (outs), (ins GR32:$src1, i32mem:$src2),
+ "test{l}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86cmp (and GR32:$src1,
+ (loadi32 addr:$src2)), 0))]>;
+
+def TEST8ri : Ii8 <0xF6, MRM0r, // flags = GR8 & imm8
+ (outs), (ins GR8:$src1, i8imm:$src2),
+ "test{b}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86cmp (and_su GR8:$src1, imm:$src2), 0))]>;
+def TEST16ri : Ii16<0xF7, MRM0r, // flags = GR16 & imm16
+ (outs), (ins GR16:$src1, i16imm:$src2),
+ "test{w}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86cmp (and_su GR16:$src1, imm:$src2), 0))]>,
+ OpSize;
+def TEST32ri : Ii32<0xF7, MRM0r, // flags = GR32 & imm32
+ (outs), (ins GR32:$src1, i32imm:$src2),
+ "test{l}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86cmp (and_su GR32:$src1, imm:$src2), 0))]>;
+
+def TEST8mi : Ii8 <0xF6, MRM0m, // flags = [mem8] & imm8
+ (outs), (ins i8mem:$src1, i8imm:$src2),
+ "test{b}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86cmp (and (loadi8 addr:$src1), imm:$src2),
+ 0))]>;
+def TEST16mi : Ii16<0xF7, MRM0m, // flags = [mem16] & imm16
+ (outs), (ins i16mem:$src1, i16imm:$src2),
+ "test{w}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86cmp (and (loadi16 addr:$src1), imm:$src2),
+ 0))]>, OpSize;
+def TEST32mi : Ii32<0xF7, MRM0m, // flags = [mem32] & imm32
+ (outs), (ins i32mem:$src1, i32imm:$src2),
+ "test{l}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86cmp (and (loadi32 addr:$src1), imm:$src2),
+ 0))]>;
+} // Defs = [EFLAGS]
+
+
+// Condition code ops, incl. set if equal/not equal/...
+let Defs = [EFLAGS], Uses = [AH], neverHasSideEffects = 1 in
+def SAHF : I<0x9E, RawFrm, (outs), (ins), "sahf", []>; // flags = AH
+let Defs = [AH], Uses = [EFLAGS], neverHasSideEffects = 1 in
+def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", []>; // AH = flags
+
+let Uses = [EFLAGS] in {
+// Use sbb to materialize carry bit.
+let Defs = [EFLAGS], isCodeGenOnly = 1 in {
+// FIXME: These are pseudo ops that should be replaced with Pat<> patterns.
+// However, Pat<> can't replicate the destination reg into the inputs of the
+// result.
+// FIXME: Change these to have encoding Pseudo when X86MCCodeEmitter replaces
+// X86CodeEmitter.
+def SETB_C8r : I<0x18, MRMInitReg, (outs GR8:$dst), (ins), "",
+ [(set GR8:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
+def SETB_C16r : I<0x19, MRMInitReg, (outs GR16:$dst), (ins), "",
+ [(set GR16:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>,
+ OpSize;
+def SETB_C32r : I<0x19, MRMInitReg, (outs GR32:$dst), (ins), "",
+ [(set GR32:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
+} // isCodeGenOnly
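+// These expand to an sbb of a register with itself; the idiom they capture
+// is (illustrative, AT&T syntax):
+//   cmpl %ecx, %eax    # CF = 1 iff EAX < ECX (unsigned)
+//   sbbl %edx, %edx    # EDX = EDX - EDX - CF = 0 or all-ones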
+
+def SETEr : I<0x94, MRM0r,
+ (outs GR8 :$dst), (ins),
+ "sete\t$dst",
+ [(set GR8:$dst, (X86setcc X86_COND_E, EFLAGS))]>,
+ TB; // GR8 = ==
+def SETEm : I<0x94, MRM0m,
+ (outs), (ins i8mem:$dst),
+ "sete\t$dst",
+ [(store (X86setcc X86_COND_E, EFLAGS), addr:$dst)]>,
+ TB; // [mem8] = ==
+
+def SETNEr : I<0x95, MRM0r,
+ (outs GR8 :$dst), (ins),
+ "setne\t$dst",
+ [(set GR8:$dst, (X86setcc X86_COND_NE, EFLAGS))]>,
+ TB; // GR8 = !=
+def SETNEm : I<0x95, MRM0m,
+ (outs), (ins i8mem:$dst),
+ "setne\t$dst",
+ [(store (X86setcc X86_COND_NE, EFLAGS), addr:$dst)]>,
+ TB; // [mem8] = !=
+
+def SETLr : I<0x9C, MRM0r,
+ (outs GR8 :$dst), (ins),
+ "setl\t$dst",
+ [(set GR8:$dst, (X86setcc X86_COND_L, EFLAGS))]>,
+ TB; // GR8 = < signed
+def SETLm : I<0x9C, MRM0m,
+ (outs), (ins i8mem:$dst),
+ "setl\t$dst",
+ [(store (X86setcc X86_COND_L, EFLAGS), addr:$dst)]>,
+ TB; // [mem8] = < signed
+
+def SETGEr : I<0x9D, MRM0r,
+ (outs GR8 :$dst), (ins),
+ "setge\t$dst",
+ [(set GR8:$dst, (X86setcc X86_COND_GE, EFLAGS))]>,
+ TB; // GR8 = >= signed
+def SETGEm : I<0x9D, MRM0m,
+ (outs), (ins i8mem:$dst),
+ "setge\t$dst",
+ [(store (X86setcc X86_COND_GE, EFLAGS), addr:$dst)]>,
+ TB; // [mem8] = >= signed
+
+def SETLEr : I<0x9E, MRM0r,
+ (outs GR8 :$dst), (ins),
+ "setle\t$dst",
+ [(set GR8:$dst, (X86setcc X86_COND_LE, EFLAGS))]>,
+ TB; // GR8 = <= signed
+def SETLEm : I<0x9E, MRM0m,
+ (outs), (ins i8mem:$dst),
+ "setle\t$dst",
+ [(store (X86setcc X86_COND_LE, EFLAGS), addr:$dst)]>,
+ TB; // [mem8] = <= signed
+
+def SETGr : I<0x9F, MRM0r,
+ (outs GR8 :$dst), (ins),
+ "setg\t$dst",
+ [(set GR8:$dst, (X86setcc X86_COND_G, EFLAGS))]>,
+ TB; // GR8 = > signed
+def SETGm : I<0x9F, MRM0m,
+ (outs), (ins i8mem:$dst),
+ "setg\t$dst",
+ [(store (X86setcc X86_COND_G, EFLAGS), addr:$dst)]>,
+ TB; // [mem8] = > signed
+
+def SETBr : I<0x92, MRM0r,
+ (outs GR8 :$dst), (ins),
+ "setb\t$dst",
+ [(set GR8:$dst, (X86setcc X86_COND_B, EFLAGS))]>,
+               TB; // GR8 = < unsigned
+def SETBm : I<0x92, MRM0m,
+ (outs), (ins i8mem:$dst),
+ "setb\t$dst",
+ [(store (X86setcc X86_COND_B, EFLAGS), addr:$dst)]>,
+               TB; // [mem8] = < unsigned
+
+def SETAEr : I<0x93, MRM0r,
+ (outs GR8 :$dst), (ins),
+ "setae\t$dst",
+ [(set GR8:$dst, (X86setcc X86_COND_AE, EFLAGS))]>,
+               TB; // GR8 = >= unsigned
+def SETAEm : I<0x93, MRM0m,
+ (outs), (ins i8mem:$dst),
+ "setae\t$dst",
+ [(store (X86setcc X86_COND_AE, EFLAGS), addr:$dst)]>,
+               TB; // [mem8] = >= unsigned
+
+def SETBEr : I<0x96, MRM0r,
+ (outs GR8 :$dst), (ins),
+ "setbe\t$dst",
+ [(set GR8:$dst, (X86setcc X86_COND_BE, EFLAGS))]>,
+               TB; // GR8 = <= unsigned
+def SETBEm : I<0x96, MRM0m,
+ (outs), (ins i8mem:$dst),
+ "setbe\t$dst",
+ [(store (X86setcc X86_COND_BE, EFLAGS), addr:$dst)]>,
+               TB; // [mem8] = <= unsigned
+
+def SETAr : I<0x97, MRM0r,
+ (outs GR8 :$dst), (ins),
+ "seta\t$dst",
+ [(set GR8:$dst, (X86setcc X86_COND_A, EFLAGS))]>,
+               TB; // GR8 = > unsigned
+def SETAm : I<0x97, MRM0m,
+ (outs), (ins i8mem:$dst),
+ "seta\t$dst",
+ [(store (X86setcc X86_COND_A, EFLAGS), addr:$dst)]>,
+               TB; // [mem8] = > unsigned
+
+def SETSr : I<0x98, MRM0r,
+ (outs GR8 :$dst), (ins),
+ "sets\t$dst",
+ [(set GR8:$dst, (X86setcc X86_COND_S, EFLAGS))]>,
+ TB; // GR8 = <sign bit>
+def SETSm : I<0x98, MRM0m,
+ (outs), (ins i8mem:$dst),
+ "sets\t$dst",
+ [(store (X86setcc X86_COND_S, EFLAGS), addr:$dst)]>,
+ TB; // [mem8] = <sign bit>
+def SETNSr : I<0x99, MRM0r,
+ (outs GR8 :$dst), (ins),
+ "setns\t$dst",
+ [(set GR8:$dst, (X86setcc X86_COND_NS, EFLAGS))]>,
+ TB; // GR8 = !<sign bit>
+def SETNSm : I<0x99, MRM0m,
+ (outs), (ins i8mem:$dst),
+ "setns\t$dst",
+ [(store (X86setcc X86_COND_NS, EFLAGS), addr:$dst)]>,
+ TB; // [mem8] = !<sign bit>
+
+def SETPr : I<0x9A, MRM0r,
+ (outs GR8 :$dst), (ins),
+ "setp\t$dst",
+ [(set GR8:$dst, (X86setcc X86_COND_P, EFLAGS))]>,
+ TB; // GR8 = parity
+def SETPm : I<0x9A, MRM0m,
+ (outs), (ins i8mem:$dst),
+ "setp\t$dst",
+ [(store (X86setcc X86_COND_P, EFLAGS), addr:$dst)]>,
+ TB; // [mem8] = parity
+def SETNPr : I<0x9B, MRM0r,
+ (outs GR8 :$dst), (ins),
+ "setnp\t$dst",
+ [(set GR8:$dst, (X86setcc X86_COND_NP, EFLAGS))]>,
+ TB; // GR8 = not parity
+def SETNPm : I<0x9B, MRM0m,
+ (outs), (ins i8mem:$dst),
+ "setnp\t$dst",
+ [(store (X86setcc X86_COND_NP, EFLAGS), addr:$dst)]>,
+ TB; // [mem8] = not parity
+
+def SETOr : I<0x90, MRM0r,
+ (outs GR8 :$dst), (ins),
+ "seto\t$dst",
+ [(set GR8:$dst, (X86setcc X86_COND_O, EFLAGS))]>,
+ TB; // GR8 = overflow
+def SETOm : I<0x90, MRM0m,
+ (outs), (ins i8mem:$dst),
+ "seto\t$dst",
+ [(store (X86setcc X86_COND_O, EFLAGS), addr:$dst)]>,
+ TB; // [mem8] = overflow
+def SETNOr : I<0x91, MRM0r,
+ (outs GR8 :$dst), (ins),
+ "setno\t$dst",
+ [(set GR8:$dst, (X86setcc X86_COND_NO, EFLAGS))]>,
+ TB; // GR8 = not overflow
+def SETNOm : I<0x91, MRM0m,
+ (outs), (ins i8mem:$dst),
+ "setno\t$dst",
+ [(store (X86setcc X86_COND_NO, EFLAGS), addr:$dst)]>,
+ TB; // [mem8] = not overflow
+} // Uses = [EFLAGS]
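+// Typical use of the setcc family (illustrative, AT&T syntax): materialize a
+// comparison result as a 0/1 integer.
+//   cmpl   %ebx, %eax
+//   setl   %al          # AL = 1 if EAX < EBX (signed), else 0
+//   movzbl %al, %eax    # widen the i8 result to i32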
+
+
+// Integer comparisons
+let Defs = [EFLAGS] in {
+def CMP8i8 : Ii8<0x3C, RawFrm, (outs), (ins i8imm:$src),
+ "cmp{b}\t{$src, %al|%al, $src}", []>;
+def CMP16i16 : Ii16<0x3D, RawFrm, (outs), (ins i16imm:$src),
+ "cmp{w}\t{$src, %ax|%ax, $src}", []>, OpSize;
+def CMP32i32 : Ii32<0x3D, RawFrm, (outs), (ins i32imm:$src),
+ "cmp{l}\t{$src, %eax|%eax, $src}", []>;
+
+def CMP8rr : I<0x38, MRMDestReg,
+ (outs), (ins GR8 :$src1, GR8 :$src2),
+ "cmp{b}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86cmp GR8:$src1, GR8:$src2))]>;
+def CMP16rr : I<0x39, MRMDestReg,
+ (outs), (ins GR16:$src1, GR16:$src2),
+ "cmp{w}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86cmp GR16:$src1, GR16:$src2))]>, OpSize;
+def CMP32rr : I<0x39, MRMDestReg,
+ (outs), (ins GR32:$src1, GR32:$src2),
+ "cmp{l}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86cmp GR32:$src1, GR32:$src2))]>;
+def CMP8mr : I<0x38, MRMDestMem,
+ (outs), (ins i8mem :$src1, GR8 :$src2),
+ "cmp{b}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86cmp (loadi8 addr:$src1), GR8:$src2))]>;
+def CMP16mr : I<0x39, MRMDestMem,
+ (outs), (ins i16mem:$src1, GR16:$src2),
+ "cmp{w}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86cmp (loadi16 addr:$src1), GR16:$src2))]>,
+ OpSize;
+def CMP32mr : I<0x39, MRMDestMem,
+ (outs), (ins i32mem:$src1, GR32:$src2),
+ "cmp{l}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86cmp (loadi32 addr:$src1), GR32:$src2))]>;
+def CMP8rm : I<0x3A, MRMSrcMem,
+ (outs), (ins GR8 :$src1, i8mem :$src2),
+ "cmp{b}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86cmp GR8:$src1, (loadi8 addr:$src2)))]>;
+def CMP16rm : I<0x3B, MRMSrcMem,
+ (outs), (ins GR16:$src1, i16mem:$src2),
+ "cmp{w}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86cmp GR16:$src1, (loadi16 addr:$src2)))]>,
+ OpSize;
+def CMP32rm : I<0x3B, MRMSrcMem,
+ (outs), (ins GR32:$src1, i32mem:$src2),
+ "cmp{l}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86cmp GR32:$src1, (loadi32 addr:$src2)))]>;
+
+// These are alternate spellings for use by the disassembler; we mark them as
+// code-gen-only so that the assembler never matches them.
+let isCodeGenOnly = 1 in {
+ def CMP8rr_alt : I<0x3A, MRMSrcReg, (outs), (ins GR8:$src1, GR8:$src2),
+ "cmp{b}\t{$src2, $src1|$src1, $src2}", []>;
+ def CMP16rr_alt : I<0x3B, MRMSrcReg, (outs), (ins GR16:$src1, GR16:$src2),
+ "cmp{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize;
+ def CMP32rr_alt : I<0x3B, MRMSrcReg, (outs), (ins GR32:$src1, GR32:$src2),
+ "cmp{l}\t{$src2, $src1|$src1, $src2}", []>;
+}
+
+def CMP8ri : Ii8<0x80, MRM7r,
+ (outs), (ins GR8:$src1, i8imm:$src2),
+ "cmp{b}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86cmp GR8:$src1, imm:$src2))]>;
+def CMP16ri : Ii16<0x81, MRM7r,
+ (outs), (ins GR16:$src1, i16imm:$src2),
+ "cmp{w}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86cmp GR16:$src1, imm:$src2))]>, OpSize;
+def CMP32ri : Ii32<0x81, MRM7r,
+ (outs), (ins GR32:$src1, i32imm:$src2),
+ "cmp{l}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86cmp GR32:$src1, imm:$src2))]>;
+def CMP8mi : Ii8 <0x80, MRM7m,
+ (outs), (ins i8mem :$src1, i8imm :$src2),
+ "cmp{b}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86cmp (loadi8 addr:$src1), imm:$src2))]>;
+def CMP16mi : Ii16<0x81, MRM7m,
+ (outs), (ins i16mem:$src1, i16imm:$src2),
+ "cmp{w}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86cmp (loadi16 addr:$src1), imm:$src2))]>,
+ OpSize;
+def CMP32mi : Ii32<0x81, MRM7m,
+ (outs), (ins i32mem:$src1, i32imm:$src2),
+ "cmp{l}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86cmp (loadi32 addr:$src1), imm:$src2))]>;
+def CMP16ri8 : Ii8<0x83, MRM7r,
+ (outs), (ins GR16:$src1, i16i8imm:$src2),
+ "cmp{w}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86cmp GR16:$src1, i16immSExt8:$src2))]>,
+ OpSize;
+def CMP16mi8 : Ii8<0x83, MRM7m,
+ (outs), (ins i16mem:$src1, i16i8imm:$src2),
+ "cmp{w}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86cmp (loadi16 addr:$src1),
+ i16immSExt8:$src2))]>, OpSize;
+def CMP32mi8 : Ii8<0x83, MRM7m,
+ (outs), (ins i32mem:$src1, i32i8imm:$src2),
+ "cmp{l}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86cmp (loadi32 addr:$src1),
+ i32immSExt8:$src2))]>;
+def CMP32ri8 : Ii8<0x83, MRM7r,
+ (outs), (ins GR32:$src1, i32i8imm:$src2),
+ "cmp{l}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86cmp GR32:$src1, i32immSExt8:$src2))]>;
+} // Defs = [EFLAGS]
+
+// Bit tests.
+// TODO: BTC, BTR, and BTS (defined below, but still without selection
+// patterns).
+let Defs = [EFLAGS] in {
+def BT16rr : I<0xA3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
+ "bt{w}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt GR16:$src1, GR16:$src2))]>, OpSize, TB;
+def BT32rr : I<0xA3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
+ "bt{l}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt GR32:$src1, GR32:$src2))]>, TB;
+
+// Unlike the register+register form, the memory+register form of the bt
+// instruction does not ignore the high bits of the index: the index selects a
+// bit anywhere in the bit string that starts at the memory operand.  From
+// ISel's perspective this is pretty bizarre, so make these instructions
+// disassembly-only for now (an example follows the definitions).
+
+def BT16mr : I<0xA3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
+ "bt{w}\t{$src2, $src1|$src1, $src2}",
+// [(X86bt (loadi16 addr:$src1), GR16:$src2),
+// (implicit EFLAGS)]
+ []
+ >, OpSize, TB, Requires<[FastBTMem]>;
+def BT32mr : I<0xA3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
+ "bt{l}\t{$src2, $src1|$src1, $src2}",
+// [(X86bt (loadi32 addr:$src1), GR32:$src2),
+// (implicit EFLAGS)]
+ []
+ >, TB, Requires<[FastBTMem]>;
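+// For example (AT&T syntax), with EAX = 100:
+//   btl %eax, (%ecx)   # tests bit 100 of the bit string at (%ecx),
+//                      # i.e. bit 4 of the dword at 12(%ecx)
+//   btl %eax, %edx     # register form: only EAX modulo 32 is used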
+
+def BT16ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR16:$src1, i16i8imm:$src2),
+ "bt{w}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt GR16:$src1, i16immSExt8:$src2))]>,
+ OpSize, TB;
+def BT32ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR32:$src1, i32i8imm:$src2),
+ "bt{l}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt GR32:$src1, i32immSExt8:$src2))]>, TB;
+// Note that these instructions don't need FastBTMem because that
+// only applies when the other operand is in a register. When it's
+// an immediate, bt is still fast.
+def BT16mi8 : Ii8<0xBA, MRM4m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
+ "bt{w}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt (loadi16 addr:$src1), i16immSExt8:$src2))
+ ]>, OpSize, TB;
+def BT32mi8 : Ii8<0xBA, MRM4m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
+ "bt{l}\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86bt (loadi32 addr:$src1), i32immSExt8:$src2))
+ ]>, TB;
+
+def BTC16rr : I<0xBB, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
+ "btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB;
+def BTC32rr : I<0xBB, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
+ "btc{l}\t{$src2, $src1|$src1, $src2}", []>, TB;
+def BTC16mr : I<0xBB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
+ "btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB;
+def BTC32mr : I<0xBB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
+ "btc{l}\t{$src2, $src1|$src1, $src2}", []>, TB;
+def BTC16ri8 : Ii8<0xBA, MRM7r, (outs), (ins GR16:$src1, i16i8imm:$src2),
+ "btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB;
+def BTC32ri8 : Ii8<0xBA, MRM7r, (outs), (ins GR32:$src1, i32i8imm:$src2),
+ "btc{l}\t{$src2, $src1|$src1, $src2}", []>, TB;
+def BTC16mi8 : Ii8<0xBA, MRM7m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
+ "btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB;
+def BTC32mi8 : Ii8<0xBA, MRM7m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
+ "btc{l}\t{$src2, $src1|$src1, $src2}", []>, TB;
+
+def BTR16rr : I<0xB3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
+ "btr{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB;
+def BTR32rr : I<0xB3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
+ "btr{l}\t{$src2, $src1|$src1, $src2}", []>, TB;
+def BTR16mr : I<0xB3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
+ "btr{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB;
+def BTR32mr : I<0xB3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
+ "btr{l}\t{$src2, $src1|$src1, $src2}", []>, TB;
+def BTR16ri8 : Ii8<0xBA, MRM6r, (outs), (ins GR16:$src1, i16i8imm:$src2),
+ "btr{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB;
+def BTR32ri8 : Ii8<0xBA, MRM6r, (outs), (ins GR32:$src1, i32i8imm:$src2),
+ "btr{l}\t{$src2, $src1|$src1, $src2}", []>, TB;
+def BTR16mi8 : Ii8<0xBA, MRM6m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
+ "btr{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB;
+def BTR32mi8 : Ii8<0xBA, MRM6m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
+ "btr{l}\t{$src2, $src1|$src1, $src2}", []>, TB;
+
+def BTS16rr : I<0xAB, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
+ "bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB;
+def BTS32rr : I<0xAB, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
+ "bts{l}\t{$src2, $src1|$src1, $src2}", []>, TB;
+def BTS16mr : I<0xAB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
+ "bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB;
+def BTS32mr : I<0xAB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
+ "bts{l}\t{$src2, $src1|$src1, $src2}", []>, TB;
+def BTS16ri8 : Ii8<0xBA, MRM5r, (outs), (ins GR16:$src1, i16i8imm:$src2),
+ "bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB;
+def BTS32ri8 : Ii8<0xBA, MRM5r, (outs), (ins GR32:$src1, i32i8imm:$src2),
+ "bts{l}\t{$src2, $src1|$src1, $src2}", []>, TB;
+def BTS16mi8 : Ii8<0xBA, MRM5m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
+ "bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB;
+def BTS32mi8 : Ii8<0xBA, MRM5m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
+ "bts{l}\t{$src2, $src1|$src1, $src2}", []>, TB;
+} // Defs = [EFLAGS]
+
+// Sign/Zero extenders
+// Use movsbl instead of movsbw; we don't care about the high 16 bits
+// of the register here. This has a smaller encoding and avoids a
+// partial-register update. Actual movsbw included for the disassembler.
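+// Illustration (AT&T syntax): prefer
+//   movsbl %cl, %eax   # writes all of EAX; no dependence on the old EAX
+// over
+//   movsbw %cl, %ax    # writes only AX, so it merges with the old upper
+//                      # half of EAX and keeps that dependence alive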
+def MOVSX16rr8W : I<0xBE, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src),
+ "movs{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+def MOVSX16rm8W : I<0xBE, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src),
+ "movs{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+def MOVSX16rr8 : I<0xBE, MRMSrcReg, (outs GR16:$dst), (ins GR8 :$src),
+ "", [(set GR16:$dst, (sext GR8:$src))]>, TB;
+def MOVSX16rm8 : I<0xBE, MRMSrcMem, (outs GR16:$dst), (ins i8mem :$src),
+ "", [(set GR16:$dst, (sextloadi16i8 addr:$src))]>, TB;
+def MOVSX32rr8 : I<0xBE, MRMSrcReg, (outs GR32:$dst), (ins GR8 :$src),
+ "movs{bl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (sext GR8:$src))]>, TB;
+def MOVSX32rm8 : I<0xBE, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src),
+ "movs{bl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (sextloadi32i8 addr:$src))]>, TB;
+def MOVSX32rr16: I<0xBF, MRMSrcReg, (outs GR32:$dst), (ins GR16:$src),
+ "movs{wl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (sext GR16:$src))]>, TB;
+def MOVSX32rm16: I<0xBF, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
+ "movs{wl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (sextloadi32i16 addr:$src))]>, TB;
+
+// Use movzbl instead of movzbw; we don't care about the high 16 bits
+// of the register here. This has a smaller encoding and avoids a
+// partial-register update. Actual movzbw included for the disassembler.
+def MOVZX16rr8W : I<0xB6, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src),
+ "movz{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+def MOVZX16rm8W : I<0xB6, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src),
+ "movz{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+def MOVZX16rr8 : I<0xB6, MRMSrcReg, (outs GR16:$dst), (ins GR8 :$src),
+ "", [(set GR16:$dst, (zext GR8:$src))]>, TB;
+def MOVZX16rm8 : I<0xB6, MRMSrcMem, (outs GR16:$dst), (ins i8mem :$src),
+ "", [(set GR16:$dst, (zextloadi16i8 addr:$src))]>, TB;
+def MOVZX32rr8 : I<0xB6, MRMSrcReg, (outs GR32:$dst), (ins GR8 :$src),
+ "movz{bl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (zext GR8:$src))]>, TB;
+def MOVZX32rm8 : I<0xB6, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src),
+ "movz{bl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (zextloadi32i8 addr:$src))]>, TB;
+def MOVZX32rr16: I<0xB7, MRMSrcReg, (outs GR32:$dst), (ins GR16:$src),
+ "movz{wl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (zext GR16:$src))]>, TB;
+def MOVZX32rm16: I<0xB7, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
+ "movz{wl|x}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (zextloadi32i16 addr:$src))]>, TB;
+
+// These are the same as the regular MOVZX32rr8 and MOVZX32rm8
+// except that they use GR32_NOREX for the output operand register class
+// instead of GR32. This allows them to operate on h registers on x86-64.
+def MOVZX32_NOREXrr8 : I<0xB6, MRMSrcReg,
+ (outs GR32_NOREX:$dst), (ins GR8:$src),
+ "movz{bl|x}\t{$src, $dst|$dst, $src} # NOREX",
+ []>, TB;
+let mayLoad = 1 in
+def MOVZX32_NOREXrm8 : I<0xB6, MRMSrcMem,
+ (outs GR32_NOREX:$dst), (ins i8mem:$src),
+ "movz{bl|x}\t{$src, $dst|$dst, $src} # NOREX",
+ []>, TB;
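+// For example, "movzbl %ah, %ecx" is only encodable without a REX prefix
+// (any REX prefix turns AH/CH/DH/BH into SPL/BPL/SIL/DIL), so the
+// destination must be restricted to the non-REX register class.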
+
+let neverHasSideEffects = 1 in {
+ let Defs = [AX], Uses = [AL] in
+ def CBW : I<0x98, RawFrm, (outs), (ins),
+ "{cbtw|cbw}", []>, OpSize; // AX = signext(AL)
+ let Defs = [EAX], Uses = [AX] in
+ def CWDE : I<0x98, RawFrm, (outs), (ins),
+ "{cwtl|cwde}", []>; // EAX = signext(AX)
+
+ let Defs = [AX,DX], Uses = [AX] in
+ def CWD : I<0x99, RawFrm, (outs), (ins),
+ "{cwtd|cwd}", []>, OpSize; // DX:AX = signext(AX)
+ let Defs = [EAX,EDX], Uses = [EAX] in
+ def CDQ : I<0x99, RawFrm, (outs), (ins),
+ "{cltd|cdq}", []>; // EDX:EAX = signext(EAX)
+}
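+// Typical use (illustrative, AT&T syntax): widen the dividend before a
+// signed divide.
+//   movl  %esi, %eax
+//   cltd               # EDX:EAX = signext(EAX)  (Intel syntax: cdq)
+//   idivl %ecx         # EAX = quotient, EDX = remainder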
+
+//===----------------------------------------------------------------------===//
+// Alias Instructions
+//===----------------------------------------------------------------------===//
+
+// Alias instructions that map movr0 to xor.
+// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
+// FIXME: Set encoding to pseudo.
+let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1,
+ isCodeGenOnly = 1 in {
+def MOV8r0 : I<0x30, MRMInitReg, (outs GR8 :$dst), (ins), "",
+ [(set GR8:$dst, 0)]>;
+
+// We want to rewrite MOV16r0 in terms of MOV32r0, because it's a smaller
+// encoding and avoids a partial-register update sometimes, but doing so
+// at isel time interferes with rematerialization in the current register
+// allocator. For now, this is rewritten when the instruction is lowered
+// to an MCInst.
+def MOV16r0 : I<0x31, MRMInitReg, (outs GR16:$dst), (ins),
+ "",
+ [(set GR16:$dst, 0)]>, OpSize;
+
+// FIXME: Set encoding to pseudo.
+def MOV32r0 : I<0x31, MRMInitReg, (outs GR32:$dst), (ins), "",
+ [(set GR32:$dst, 0)]>;
+}
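+// The xor form is preferred because it is shorter and most modern processors
+// treat it as a dependency-breaking zeroing idiom, e.g.:
+//   xorl %eax, %eax    # 31 C0           (2 bytes, clobbers EFLAGS)
+//   movl $0, %eax      # B8 00 00 00 00  (5 bytes, EFLAGS untouched)
+// The EFLAGS clobber is why these live under Defs = [EFLAGS] above.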
+
+//===----------------------------------------------------------------------===//
+// Thread Local Storage Instructions
+//
+
+// All calls clobber the non-callee-saved registers.  ESP is marked as a use
+// so that stack-pointer assignments that appear immediately before calls are
+// not mistakenly treated as dead.
+let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0,
+ MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
+ XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+ XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
+ Uses = [ESP] in
+def TLS_addr32 : I<0, Pseudo, (outs), (ins lea32mem:$sym),
+ "leal\t$sym, %eax; "
+ "call\t___tls_get_addr@PLT",
+ [(X86tlsaddr tls32addr:$sym)]>,
+ Requires<[In32BitMode]>;
+
+let AddedComplexity = 5, isCodeGenOnly = 1 in
+def GS_MOV32rm : I<0x8B, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "movl\t%gs:$src, $dst",
+ [(set GR32:$dst, (gsload addr:$src))]>, SegGS;
+
+let AddedComplexity = 5, isCodeGenOnly = 1 in
+def FS_MOV32rm : I<0x8B, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "movl\t%fs:$src, $dst",
+ [(set GR32:$dst, (fsload addr:$src))]>, SegFS;
+
+//===----------------------------------------------------------------------===//
+// EH Pseudo Instructions
+//
+let isTerminator = 1, isReturn = 1, isBarrier = 1,
+ hasCtrlDep = 1, isCodeGenOnly = 1 in {
+def EH_RETURN : I<0xC3, RawFrm, (outs), (ins GR32:$addr),
+ "ret\t#eh_return, addr: $addr",
+ [(X86ehret GR32:$addr)]>;
+
+}
+
+//===----------------------------------------------------------------------===//
+// Atomic support
+//
+
+// Atomic swap. These are just normal xchg instructions; when a memory operand
+// is referenced, the processor asserts LOCK implicitly, so atomicity is ensured.
+let Constraints = "$val = $dst" in {
+def XCHG32rm : I<0x87, MRMSrcMem, (outs GR32:$dst),
+ (ins GR32:$val, i32mem:$ptr),
+ "xchg{l}\t{$val, $ptr|$ptr, $val}",
+ [(set GR32:$dst, (atomic_swap_32 addr:$ptr, GR32:$val))]>;
+def XCHG16rm : I<0x87, MRMSrcMem, (outs GR16:$dst),
+ (ins GR16:$val, i16mem:$ptr),
+ "xchg{w}\t{$val, $ptr|$ptr, $val}",
+ [(set GR16:$dst, (atomic_swap_16 addr:$ptr, GR16:$val))]>,
+ OpSize;
+def XCHG8rm : I<0x86, MRMSrcMem, (outs GR8:$dst), (ins GR8:$val, i8mem:$ptr),
+ "xchg{b}\t{$val, $ptr|$ptr, $val}",
+ [(set GR8:$dst, (atomic_swap_8 addr:$ptr, GR8:$val))]>;
+
+def XCHG32rr : I<0x87, MRMSrcReg, (outs GR32:$dst), (ins GR32:$val, GR32:$src),
+ "xchg{l}\t{$val, $src|$src, $val}", []>;
+def XCHG16rr : I<0x87, MRMSrcReg, (outs GR16:$dst), (ins GR16:$val, GR16:$src),
+ "xchg{w}\t{$val, $src|$src, $val}", []>, OpSize;
+def XCHG8rr : I<0x86, MRMSrcReg, (outs GR8:$dst), (ins GR8:$val, GR8:$src),
+ "xchg{b}\t{$val, $src|$src, $val}", []>;
+}
+
+def XCHG16ar : I<0x90, AddRegFrm, (outs), (ins GR16:$src),
+ "xchg{w}\t{$src, %ax|%ax, $src}", []>, OpSize;
+def XCHG32ar : I<0x90, AddRegFrm, (outs), (ins GR32:$src),
+ "xchg{l}\t{$src, %eax|%eax, $src}", []>;
+
+// Atomic compare and swap.
+let Defs = [EAX, EFLAGS], Uses = [EAX] in {
+def LCMPXCHG32 : I<0xB1, MRMDestMem, (outs), (ins i32mem:$ptr, GR32:$swap),
+ "lock\n\t"
+ "cmpxchg{l}\t{$swap, $ptr|$ptr, $swap}",
+ [(X86cas addr:$ptr, GR32:$swap, 4)]>, TB, LOCK;
+}
+let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX] in {
+def LCMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$ptr),
+ "lock\n\t"
+ "cmpxchg8b\t$ptr",
+ [(X86cas8 addr:$ptr)]>, TB, LOCK;
+}
+
+let Defs = [AX, EFLAGS], Uses = [AX] in {
+def LCMPXCHG16 : I<0xB1, MRMDestMem, (outs), (ins i16mem:$ptr, GR16:$swap),
+ "lock\n\t"
+ "cmpxchg{w}\t{$swap, $ptr|$ptr, $swap}",
+ [(X86cas addr:$ptr, GR16:$swap, 2)]>, TB, OpSize, LOCK;
+}
+let Defs = [AL, EFLAGS], Uses = [AL] in {
+def LCMPXCHG8 : I<0xB0, MRMDestMem, (outs), (ins i8mem:$ptr, GR8:$swap),
+ "lock\n\t"
+ "cmpxchg{b}\t{$swap, $ptr|$ptr, $swap}",
+ [(X86cas addr:$ptr, GR8:$swap, 1)]>, TB, LOCK;
+}
+
+// Atomic exchange and add
+let Constraints = "$val = $dst", Defs = [EFLAGS] in {
+def LXADD32 : I<0xC1, MRMSrcMem, (outs GR32:$dst), (ins GR32:$val, i32mem:$ptr),
+ "lock\n\t"
+ "xadd{l}\t{$val, $ptr|$ptr, $val}",
+ [(set GR32:$dst, (atomic_load_add_32 addr:$ptr, GR32:$val))]>,
+ TB, LOCK;
+def LXADD16 : I<0xC1, MRMSrcMem, (outs GR16:$dst), (ins GR16:$val, i16mem:$ptr),
+ "lock\n\t"
+ "xadd{w}\t{$val, $ptr|$ptr, $val}",
+ [(set GR16:$dst, (atomic_load_add_16 addr:$ptr, GR16:$val))]>,
+ TB, OpSize, LOCK;
+def LXADD8 : I<0xC0, MRMSrcMem, (outs GR8:$dst), (ins GR8:$val, i8mem:$ptr),
+ "lock\n\t"
+ "xadd{b}\t{$val, $ptr|$ptr, $val}",
+ [(set GR8:$dst, (atomic_load_add_8 addr:$ptr, GR8:$val))]>,
+ TB, LOCK;
+}
+
+def XADD8rr : I<0xC0, MRMDestReg, (outs GR8:$dst), (ins GR8:$src),
+ "xadd{b}\t{$src, $dst|$dst, $src}", []>, TB;
+def XADD16rr : I<0xC1, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
+ "xadd{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+def XADD32rr : I<0xC1, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
+ "xadd{l}\t{$src, $dst|$dst, $src}", []>, TB;
+
+let mayLoad = 1, mayStore = 1 in {
+def XADD8rm : I<0xC0, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src),
+ "xadd{b}\t{$src, $dst|$dst, $src}", []>, TB;
+def XADD16rm : I<0xC1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
+ "xadd{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+def XADD32rm : I<0xC1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+ "xadd{l}\t{$src, $dst|$dst, $src}", []>, TB;
+}
+
+def CMPXCHG8rr : I<0xB0, MRMDestReg, (outs GR8:$dst), (ins GR8:$src),
+ "cmpxchg{b}\t{$src, $dst|$dst, $src}", []>, TB;
+def CMPXCHG16rr : I<0xB1, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
+ "cmpxchg{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+def CMPXCHG32rr : I<0xB1, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
+ "cmpxchg{l}\t{$src, $dst|$dst, $src}", []>, TB;
+
+let mayLoad = 1, mayStore = 1 in {
+def CMPXCHG8rm : I<0xB0, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src),
+ "cmpxchg{b}\t{$src, $dst|$dst, $src}", []>, TB;
+def CMPXCHG16rm : I<0xB1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
+ "cmpxchg{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+def CMPXCHG32rm : I<0xB1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+ "cmpxchg{l}\t{$src, $dst|$dst, $src}", []>, TB;
+}
+
+let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX] in
+def CMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$dst),
+ "cmpxchg8b\t$dst", []>, TB;
+
+// Optimized codegen when the non-memory output is not used.
+// FIXME: Use normal add / sub instructions and add lock prefix dynamically.
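+// When the value loaded from memory is not needed, a locked add/sub/inc/dec
+// to memory is smaller than lock xadd and does not tie up a result register.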
+let Defs = [EFLAGS], mayLoad = 1, mayStore = 1 in {
+def LOCK_ADD8mr : I<0x00, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2),
+ "lock\n\t"
+ "add{b}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+def LOCK_ADD16mr : I<0x01, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+ "lock\n\t"
+ "add{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize, LOCK;
+def LOCK_ADD32mr : I<0x01, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+ "lock\n\t"
+ "add{l}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+def LOCK_ADD8mi : Ii8<0x80, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src2),
+ "lock\n\t"
+ "add{b}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+def LOCK_ADD16mi : Ii16<0x81, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src2),
+ "lock\n\t"
+ "add{w}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+def LOCK_ADD32mi : Ii32<0x81, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src2),
+ "lock\n\t"
+ "add{l}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+def LOCK_ADD16mi8 : Ii8<0x83, MRM0m, (outs), (ins i16mem:$dst, i16i8imm :$src2),
+ "lock\n\t"
+ "add{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize, LOCK;
+def LOCK_ADD32mi8 : Ii8<0x83, MRM0m, (outs), (ins i32mem:$dst, i32i8imm :$src2),
+ "lock\n\t"
+ "add{l}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+
+def LOCK_INC8m : I<0xFE, MRM0m, (outs), (ins i8mem :$dst),
+ "lock\n\t"
+ "inc{b}\t$dst", []>, LOCK;
+def LOCK_INC16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst),
+ "lock\n\t"
+ "inc{w}\t$dst", []>, OpSize, LOCK;
+def LOCK_INC32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst),
+ "lock\n\t"
+ "inc{l}\t$dst", []>, LOCK;
+
+def LOCK_SUB8mr : I<0x28, MRMDestMem, (outs), (ins i8mem :$dst, GR8 :$src2),
+ "lock\n\t"
+ "sub{b}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+def LOCK_SUB16mr : I<0x29, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+ "lock\n\t"
+ "sub{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize, LOCK;
+def LOCK_SUB32mr : I<0x29, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+ "lock\n\t"
+ "sub{l}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+def LOCK_SUB8mi : Ii8<0x80, MRM5m, (outs), (ins i8mem :$dst, i8imm:$src2),
+ "lock\n\t"
+ "sub{b}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+def LOCK_SUB16mi : Ii16<0x81, MRM5m, (outs), (ins i16mem:$dst, i16imm:$src2),
+ "lock\n\t"
+ "sub{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize, LOCK;
+def LOCK_SUB32mi : Ii32<0x81, MRM5m, (outs), (ins i32mem:$dst, i32imm:$src2),
+ "lock\n\t"
+ "sub{l}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+def LOCK_SUB16mi8 : Ii8<0x83, MRM5m, (outs), (ins i16mem:$dst, i16i8imm :$src2),
+ "lock\n\t"
+ "sub{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize, LOCK;
+def LOCK_SUB32mi8 : Ii8<0x83, MRM5m, (outs), (ins i32mem:$dst, i32i8imm :$src2),
+ "lock\n\t"
+ "sub{l}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+
+def LOCK_DEC8m : I<0xFE, MRM1m, (outs), (ins i8mem :$dst),
+ "lock\n\t"
+ "dec{b}\t$dst", []>, LOCK;
+def LOCK_DEC16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst),
+ "lock\n\t"
+ "dec{w}\t$dst", []>, OpSize, LOCK;
+def LOCK_DEC32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst),
+ "lock\n\t"
+ "dec{l}\t$dst", []>, LOCK;
+}
+
+// Atomic exchange, and, or, xor
+let Constraints = "$val = $dst", Defs = [EFLAGS],
+ usesCustomInserter = 1 in {
+def ATOMAND32 : I<0, Pseudo, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
+ "#ATOMAND32 PSEUDO!",
+ [(set GR32:$dst, (atomic_load_and_32 addr:$ptr, GR32:$val))]>;
+def ATOMOR32 : I<0, Pseudo, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
+ "#ATOMOR32 PSEUDO!",
+ [(set GR32:$dst, (atomic_load_or_32 addr:$ptr, GR32:$val))]>;
+def ATOMXOR32 : I<0, Pseudo,(outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
+ "#ATOMXOR32 PSEUDO!",
+ [(set GR32:$dst, (atomic_load_xor_32 addr:$ptr, GR32:$val))]>;
+def ATOMNAND32 : I<0, Pseudo,(outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
+ "#ATOMNAND32 PSEUDO!",
+ [(set GR32:$dst, (atomic_load_nand_32 addr:$ptr, GR32:$val))]>;
+def ATOMMIN32: I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$ptr, GR32:$val),
+ "#ATOMMIN32 PSEUDO!",
+ [(set GR32:$dst, (atomic_load_min_32 addr:$ptr, GR32:$val))]>;
+def ATOMMAX32: I<0, Pseudo, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
+ "#ATOMMAX32 PSEUDO!",
+ [(set GR32:$dst, (atomic_load_max_32 addr:$ptr, GR32:$val))]>;
+def ATOMUMIN32: I<0, Pseudo, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
+ "#ATOMUMIN32 PSEUDO!",
+ [(set GR32:$dst, (atomic_load_umin_32 addr:$ptr, GR32:$val))]>;
+def ATOMUMAX32: I<0, Pseudo, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
+ "#ATOMUMAX32 PSEUDO!",
+ [(set GR32:$dst, (atomic_load_umax_32 addr:$ptr, GR32:$val))]>;
+
+def ATOMAND16 : I<0, Pseudo, (outs GR16:$dst),(ins i16mem:$ptr, GR16:$val),
+ "#ATOMAND16 PSEUDO!",
+ [(set GR16:$dst, (atomic_load_and_16 addr:$ptr, GR16:$val))]>;
+def ATOMOR16 : I<0, Pseudo, (outs GR16:$dst),(ins i16mem:$ptr, GR16:$val),
+ "#ATOMOR16 PSEUDO!",
+ [(set GR16:$dst, (atomic_load_or_16 addr:$ptr, GR16:$val))]>;
+def ATOMXOR16 : I<0, Pseudo,(outs GR16:$dst),(ins i16mem:$ptr, GR16:$val),
+ "#ATOMXOR16 PSEUDO!",
+ [(set GR16:$dst, (atomic_load_xor_16 addr:$ptr, GR16:$val))]>;
+def ATOMNAND16 : I<0, Pseudo,(outs GR16:$dst),(ins i16mem:$ptr, GR16:$val),
+ "#ATOMNAND16 PSEUDO!",
+ [(set GR16:$dst, (atomic_load_nand_16 addr:$ptr, GR16:$val))]>;
+def ATOMMIN16: I<0, Pseudo, (outs GR16:$dst), (ins i16mem:$ptr, GR16:$val),
+ "#ATOMMIN16 PSEUDO!",
+ [(set GR16:$dst, (atomic_load_min_16 addr:$ptr, GR16:$val))]>;
+def ATOMMAX16: I<0, Pseudo, (outs GR16:$dst),(ins i16mem:$ptr, GR16:$val),
+ "#ATOMMAX16 PSEUDO!",
+ [(set GR16:$dst, (atomic_load_max_16 addr:$ptr, GR16:$val))]>;
+def ATOMUMIN16: I<0, Pseudo, (outs GR16:$dst),(ins i16mem:$ptr, GR16:$val),
+ "#ATOMUMIN16 PSEUDO!",
+ [(set GR16:$dst, (atomic_load_umin_16 addr:$ptr, GR16:$val))]>;
+def ATOMUMAX16: I<0, Pseudo, (outs GR16:$dst),(ins i16mem:$ptr, GR16:$val),
+ "#ATOMUMAX16 PSEUDO!",
+ [(set GR16:$dst, (atomic_load_umax_16 addr:$ptr, GR16:$val))]>;
+
+def ATOMAND8 : I<0, Pseudo, (outs GR8:$dst),(ins i8mem:$ptr, GR8:$val),
+ "#ATOMAND8 PSEUDO!",
+ [(set GR8:$dst, (atomic_load_and_8 addr:$ptr, GR8:$val))]>;
+def ATOMOR8 : I<0, Pseudo, (outs GR8:$dst),(ins i8mem:$ptr, GR8:$val),
+ "#ATOMOR8 PSEUDO!",
+ [(set GR8:$dst, (atomic_load_or_8 addr:$ptr, GR8:$val))]>;
+def ATOMXOR8 : I<0, Pseudo,(outs GR8:$dst),(ins i8mem:$ptr, GR8:$val),
+ "#ATOMXOR8 PSEUDO!",
+ [(set GR8:$dst, (atomic_load_xor_8 addr:$ptr, GR8:$val))]>;
+def ATOMNAND8 : I<0, Pseudo,(outs GR8:$dst),(ins i8mem:$ptr, GR8:$val),
+ "#ATOMNAND8 PSEUDO!",
+ [(set GR8:$dst, (atomic_load_nand_8 addr:$ptr, GR8:$val))]>;
+}
+
+let Constraints = "$val1 = $dst1, $val2 = $dst2",
+ Defs = [EFLAGS, EAX, EBX, ECX, EDX],
+ Uses = [EAX, EBX, ECX, EDX],
+ mayLoad = 1, mayStore = 1,
+ usesCustomInserter = 1 in {
+def ATOMAND6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
+ (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
+ "#ATOMAND6432 PSEUDO!", []>;
+def ATOMOR6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
+ (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
+ "#ATOMOR6432 PSEUDO!", []>;
+def ATOMXOR6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
+ (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
+ "#ATOMXOR6432 PSEUDO!", []>;
+def ATOMNAND6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
+ (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
+ "#ATOMNAND6432 PSEUDO!", []>;
+def ATOMADD6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
+ (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
+ "#ATOMADD6432 PSEUDO!", []>;
+def ATOMSUB6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
+ (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
+ "#ATOMSUB6432 PSEUDO!", []>;
+def ATOMSWAP6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
+ (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
+ "#ATOMSWAP6432 PSEUDO!", []>;
+}
+
+// Segmentation support instructions.
+
+def LAR16rm : I<0x02, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "lar{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+def LAR16rr : I<0x02, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "lar{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+
+// The i16mem operand in LAR32rm and the GR32 operand in LAR32rr are not typos:
+// LAR only reads a 16-bit segment selector, whatever the destination size.
+def LAR32rm : I<0x02, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
+ "lar{l}\t{$src, $dst|$dst, $src}", []>, TB;
+def LAR32rr : I<0x02, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "lar{l}\t{$src, $dst|$dst, $src}", []>, TB;
+
+def LSL16rm : I<0x03, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "lsl{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+def LSL16rr : I<0x03, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "lsl{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+def LSL32rm : I<0x03, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "lsl{l}\t{$src, $dst|$dst, $src}", []>, TB;
+def LSL32rr : I<0x03, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "lsl{l}\t{$src, $dst|$dst, $src}", []>, TB;
+
+def INVLPG : I<0x01, MRM7m, (outs), (ins i8mem:$addr), "invlpg\t$addr", []>, TB;
+
+def STRr : I<0x00, MRM1r, (outs GR16:$dst), (ins),
+ "str{w}\t{$dst}", []>, TB;
+def STRm : I<0x00, MRM1m, (outs i16mem:$dst), (ins),
+ "str{w}\t{$dst}", []>, TB;
+def LTRr : I<0x00, MRM3r, (outs), (ins GR16:$src),
+ "ltr{w}\t{$src}", []>, TB;
+def LTRm : I<0x00, MRM3m, (outs), (ins i16mem:$src),
+ "ltr{w}\t{$src}", []>, TB;
+
+def PUSHFS16 : I<0xa0, RawFrm, (outs), (ins),
+ "push{w}\t%fs", []>, OpSize, TB;
+def PUSHFS32 : I<0xa0, RawFrm, (outs), (ins),
+ "push{l}\t%fs", []>, TB;
+def PUSHGS16 : I<0xa8, RawFrm, (outs), (ins),
+ "push{w}\t%gs", []>, OpSize, TB;
+def PUSHGS32 : I<0xa8, RawFrm, (outs), (ins),
+ "push{l}\t%gs", []>, TB;
+
+def POPFS16 : I<0xa1, RawFrm, (outs), (ins),
+ "pop{w}\t%fs", []>, OpSize, TB;
+def POPFS32 : I<0xa1, RawFrm, (outs), (ins),
+ "pop{l}\t%fs", []>, TB;
+def POPGS16 : I<0xa9, RawFrm, (outs), (ins),
+ "pop{w}\t%gs", []>, OpSize, TB;
+def POPGS32 : I<0xa9, RawFrm, (outs), (ins),
+ "pop{l}\t%gs", []>, TB;
+
+def LDS16rm : I<0xc5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
+ "lds{w}\t{$src, $dst|$dst, $src}", []>, OpSize;
+def LDS32rm : I<0xc5, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
+ "lds{l}\t{$src, $dst|$dst, $src}", []>;
+def LSS16rm : I<0xb2, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
+ "lss{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+def LSS32rm : I<0xb2, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
+ "lss{l}\t{$src, $dst|$dst, $src}", []>, TB;
+def LES16rm : I<0xc4, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
+ "les{w}\t{$src, $dst|$dst, $src}", []>, OpSize;
+def LES32rm : I<0xc4, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
+ "les{l}\t{$src, $dst|$dst, $src}", []>;
+def LFS16rm : I<0xb4, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
+ "lfs{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+def LFS32rm : I<0xb4, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
+ "lfs{l}\t{$src, $dst|$dst, $src}", []>, TB;
+def LGS16rm : I<0xb5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
+ "lgs{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+def LGS32rm : I<0xb5, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
+ "lgs{l}\t{$src, $dst|$dst, $src}", []>, TB;
+
+def VERRr : I<0x00, MRM4r, (outs), (ins GR16:$seg),
+ "verr\t$seg", []>, TB;
+def VERRm : I<0x00, MRM4m, (outs), (ins i16mem:$seg),
+ "verr\t$seg", []>, TB;
+def VERWr : I<0x00, MRM5r, (outs), (ins GR16:$seg),
+ "verw\t$seg", []>, TB;
+def VERWm : I<0x00, MRM5m, (outs), (ins i16mem:$seg),
+ "verw\t$seg", []>, TB;
+
+// Descriptor-table support instructions
+
+def SGDTm : I<0x01, MRM0m, (outs opaque48mem:$dst), (ins),
+ "sgdt\t$dst", []>, TB;
+def SIDTm : I<0x01, MRM1m, (outs opaque48mem:$dst), (ins),
+ "sidt\t$dst", []>, TB;
+def SLDT16r : I<0x00, MRM0r, (outs GR16:$dst), (ins),
+ "sldt{w}\t$dst", []>, TB;
+def SLDT16m : I<0x00, MRM0m, (outs i16mem:$dst), (ins),
+ "sldt{w}\t$dst", []>, TB;
+def LGDTm : I<0x01, MRM2m, (outs), (ins opaque48mem:$src),
+ "lgdt\t$src", []>, TB;
+def LIDTm : I<0x01, MRM3m, (outs), (ins opaque48mem:$src),
+ "lidt\t$src", []>, TB;
+def LLDT16r : I<0x00, MRM2r, (outs), (ins GR16:$src),
+ "lldt{w}\t$src", []>, TB;
+def LLDT16m : I<0x00, MRM2m, (outs), (ins i16mem:$src),
+ "lldt{w}\t$src", []>, TB;
+
+// Lock instruction prefix
+def LOCK_PREFIX : I<0xF0, RawFrm, (outs), (ins), "lock", []>;
+
+// Repeat string operation instruction prefixes
+// These use ECX as the repeat count (decremented each iteration) and the DF
+// flag in EFLAGS to control the direction of the string operation
+let Defs = [ECX], Uses = [ECX,EFLAGS] in {
+// Repeat (used with INS, OUTS, MOVS, LODS and STOS)
+def REP_PREFIX : I<0xF3, RawFrm, (outs), (ins), "rep", []>;
+// Repeat while not equal (used with CMPS and SCAS)
+def REPNE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "repne", []>;
+}
+
+// Segment override instruction prefixes
+def CS_PREFIX : I<0x2E, RawFrm, (outs), (ins), "cs", []>;
+def SS_PREFIX : I<0x36, RawFrm, (outs), (ins), "ss", []>;
+def DS_PREFIX : I<0x3E, RawFrm, (outs), (ins), "ds", []>;
+def ES_PREFIX : I<0x26, RawFrm, (outs), (ins), "es", []>;
+def FS_PREFIX : I<0x64, RawFrm, (outs), (ins), "fs", []>;
+def GS_PREFIX : I<0x65, RawFrm, (outs), (ins), "gs", []>;
+
+// String manipulation instructions
+
+def LODSB : I<0xAC, RawFrm, (outs), (ins), "lodsb", []>;
+def LODSW : I<0xAD, RawFrm, (outs), (ins), "lodsw", []>, OpSize;
+def LODSD : I<0xAD, RawFrm, (outs), (ins), "lods{l|d}", []>;
+
+def OUTSB : I<0x6E, RawFrm, (outs), (ins), "outsb", []>;
+def OUTSW : I<0x6F, RawFrm, (outs), (ins), "outsw", []>, OpSize;
+def OUTSD : I<0x6F, RawFrm, (outs), (ins), "outs{l|d}", []>;
+
+// CPU flow control instructions
+
+def HLT : I<0xF4, RawFrm, (outs), (ins), "hlt", []>;
+def RSM : I<0xAA, RawFrm, (outs), (ins), "rsm", []>, TB;
+
+// FPU control instructions
+
+def FNINIT : I<0xE3, RawFrm, (outs), (ins), "fninit", []>, DB;
+
+// Flag instructions
+
+def CLC : I<0xF8, RawFrm, (outs), (ins), "clc", []>;
+def STC : I<0xF9, RawFrm, (outs), (ins), "stc", []>;
+def CLI : I<0xFA, RawFrm, (outs), (ins), "cli", []>;
+def STI : I<0xFB, RawFrm, (outs), (ins), "sti", []>;
+def CLD : I<0xFC, RawFrm, (outs), (ins), "cld", []>;
+def STD : I<0xFD, RawFrm, (outs), (ins), "std", []>;
+def CMC : I<0xF5, RawFrm, (outs), (ins), "cmc", []>;
+
+def CLTS : I<0x06, RawFrm, (outs), (ins), "clts", []>, TB;
+
+// Table lookup instructions
+
+def XLAT : I<0xD7, RawFrm, (outs), (ins), "xlatb", []>;
+
+// Specialized register support
+
+def WRMSR : I<0x30, RawFrm, (outs), (ins), "wrmsr", []>, TB;
+def RDMSR : I<0x32, RawFrm, (outs), (ins), "rdmsr", []>, TB;
+def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", []>, TB;
+
+def SMSW16r : I<0x01, MRM4r, (outs GR16:$dst), (ins),
+ "smsw{w}\t$dst", []>, OpSize, TB;
+def SMSW32r : I<0x01, MRM4r, (outs GR32:$dst), (ins),
+ "smsw{l}\t$dst", []>, TB;
+// For memory operands, there is only a 16-bit form
+def SMSW16m : I<0x01, MRM4m, (outs i16mem:$dst), (ins),
+ "smsw{w}\t$dst", []>, TB;
+
+def LMSW16r : I<0x01, MRM6r, (outs), (ins GR16:$src),
+ "lmsw{w}\t$src", []>, TB;
+def LMSW16m : I<0x01, MRM6m, (outs), (ins i16mem:$src),
+ "lmsw{w}\t$src", []>, TB;
+
+def CPUID : I<0xA2, RawFrm, (outs), (ins), "cpuid", []>, TB;
+
+// Cache instructions
+
+def INVD : I<0x08, RawFrm, (outs), (ins), "invd", []>, TB;
+def WBINVD : I<0x09, RawFrm, (outs), (ins), "wbinvd", []>, TB;
+
+// VMX instructions
+
+// 66 0F 38 80
+def INVEPT : I<0x80, RawFrm, (outs), (ins), "invept", []>, OpSize, T8;
+// 66 0F 38 81
+def INVVPID : I<0x81, RawFrm, (outs), (ins), "invvpid", []>, OpSize, T8;
+// 0F 01 C1
+def VMCALL : I<0x01, MRM_C1, (outs), (ins), "vmcall", []>, TB;
+def VMCLEARm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs),
+ "vmclear\t$vmcs", []>, OpSize, TB;
+// 0F 01 C2
+def VMLAUNCH : I<0x01, MRM_C2, (outs), (ins), "vmlaunch", []>, TB;
+// 0F 01 C3
+def VMRESUME : I<0x01, MRM_C3, (outs), (ins), "vmresume", []>, TB;
+def VMPTRLDm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs),
+ "vmptrld\t$vmcs", []>, TB;
+def VMPTRSTm : I<0xC7, MRM7m, (outs i64mem:$vmcs), (ins),
+ "vmptrst\t$vmcs", []>, TB;
+def VMREAD64rm : I<0x78, MRMDestMem, (outs i64mem:$dst), (ins GR64:$src),
+ "vmread{q}\t{$src, $dst|$dst, $src}", []>, TB;
+def VMREAD64rr : I<0x78, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
+ "vmread{q}\t{$src, $dst|$dst, $src}", []>, TB;
+def VMREAD32rm : I<0x78, MRMDestMem, (outs i32mem:$dst), (ins GR32:$src),
+ "vmread{l}\t{$src, $dst|$dst, $src}", []>, TB;
+def VMREAD32rr : I<0x78, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
+ "vmread{l}\t{$src, $dst|$dst, $src}", []>, TB;
+def VMWRITE64rm : I<0x79, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, TB;
+def VMWRITE64rr : I<0x79, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, TB;
+def VMWRITE32rm : I<0x79, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, TB;
+def VMWRITE32rr : I<0x79, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, TB;
+// 0F 01 C4
+def VMXOFF : I<0x01, MRM_C4, (outs), (ins), "vmxoff", []>, TB;
+def VMXON : I<0xC7, MRM6m, (outs), (ins i64mem:$vmxon),
+ "vmxon\t{$vmxon}", []>, XS;
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// ConstantPool, GlobalAddress, ExternalSymbol, and JumpTable
+def : Pat<(i32 (X86Wrapper tconstpool :$dst)), (MOV32ri tconstpool :$dst)>;
+def : Pat<(i32 (X86Wrapper tjumptable :$dst)), (MOV32ri tjumptable :$dst)>;
+def : Pat<(i32 (X86Wrapper tglobaltlsaddr:$dst)),(MOV32ri tglobaltlsaddr:$dst)>;
+def : Pat<(i32 (X86Wrapper tglobaladdr :$dst)), (MOV32ri tglobaladdr :$dst)>;
+def : Pat<(i32 (X86Wrapper texternalsym:$dst)), (MOV32ri texternalsym:$dst)>;
+def : Pat<(i32 (X86Wrapper tblockaddress:$dst)), (MOV32ri tblockaddress:$dst)>;
+
+def : Pat<(add GR32:$src1, (X86Wrapper tconstpool:$src2)),
+ (ADD32ri GR32:$src1, tconstpool:$src2)>;
+def : Pat<(add GR32:$src1, (X86Wrapper tjumptable:$src2)),
+ (ADD32ri GR32:$src1, tjumptable:$src2)>;
+def : Pat<(add GR32:$src1, (X86Wrapper tglobaladdr :$src2)),
+ (ADD32ri GR32:$src1, tglobaladdr:$src2)>;
+def : Pat<(add GR32:$src1, (X86Wrapper texternalsym:$src2)),
+ (ADD32ri GR32:$src1, texternalsym:$src2)>;
+def : Pat<(add GR32:$src1, (X86Wrapper tblockaddress:$src2)),
+ (ADD32ri GR32:$src1, tblockaddress:$src2)>;
+
+def : Pat<(store (i32 (X86Wrapper tglobaladdr:$src)), addr:$dst),
+ (MOV32mi addr:$dst, tglobaladdr:$src)>;
+def : Pat<(store (i32 (X86Wrapper texternalsym:$src)), addr:$dst),
+ (MOV32mi addr:$dst, texternalsym:$src)>;
+def : Pat<(store (i32 (X86Wrapper tblockaddress:$src)), addr:$dst),
+ (MOV32mi addr:$dst, tblockaddress:$src)>;
+
+// Calls
+// tailcall stuff
+def : Pat<(X86tcret GR32_TC:$dst, imm:$off),
+ (TCRETURNri GR32_TC:$dst, imm:$off)>,
+ Requires<[In32BitMode]>;
+
+// FIXME: This is disabled for 32-bit PIC mode because the global base
+// register, which is part of the address mode, may be assigned a
+// callee-saved register.
+def : Pat<(X86tcret (load addr:$dst), imm:$off),
+ (TCRETURNmi addr:$dst, imm:$off)>,
+ Requires<[In32BitMode, IsNotPIC]>;
+
+def : Pat<(X86tcret (i32 tglobaladdr:$dst), imm:$off),
+ (TCRETURNdi texternalsym:$dst, imm:$off)>,
+ Requires<[In32BitMode]>;
+
+def : Pat<(X86tcret (i32 texternalsym:$dst), imm:$off),
+ (TCRETURNdi texternalsym:$dst, imm:$off)>,
+ Requires<[In32BitMode]>;
+
+// Normal calls, with various flavors of addresses.
+def : Pat<(X86call (i32 tglobaladdr:$dst)),
+ (CALLpcrel32 tglobaladdr:$dst)>;
+def : Pat<(X86call (i32 texternalsym:$dst)),
+ (CALLpcrel32 texternalsym:$dst)>;
+def : Pat<(X86call (i32 imm:$dst)),
+ (CALLpcrel32 imm:$dst)>, Requires<[CallImmAddr]>;
+
+// X86 specific add which produces a flag.
+def : Pat<(addc GR32:$src1, GR32:$src2),
+ (ADD32rr GR32:$src1, GR32:$src2)>;
+def : Pat<(addc GR32:$src1, (load addr:$src2)),
+ (ADD32rm GR32:$src1, addr:$src2)>;
+def : Pat<(addc GR32:$src1, imm:$src2),
+ (ADD32ri GR32:$src1, imm:$src2)>;
+def : Pat<(addc GR32:$src1, i32immSExt8:$src2),
+ (ADD32ri8 GR32:$src1, i32immSExt8:$src2)>;
+
+def : Pat<(subc GR32:$src1, GR32:$src2),
+ (SUB32rr GR32:$src1, GR32:$src2)>;
+def : Pat<(subc GR32:$src1, (load addr:$src2)),
+ (SUB32rm GR32:$src1, addr:$src2)>;
+def : Pat<(subc GR32:$src1, imm:$src2),
+ (SUB32ri GR32:$src1, imm:$src2)>;
+def : Pat<(subc GR32:$src1, i32immSExt8:$src2),
+ (SUB32ri8 GR32:$src1, i32immSExt8:$src2)>;
+
+// Comparisons.
+
+// TEST R,R is smaller than CMP R,0
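+// (e.g. "testl %eax, %eax" is 2 bytes, while "cmpl $0, %eax" needs at least 3.)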
+def : Pat<(X86cmp GR8:$src1, 0),
+ (TEST8rr GR8:$src1, GR8:$src1)>;
+def : Pat<(X86cmp GR16:$src1, 0),
+ (TEST16rr GR16:$src1, GR16:$src1)>;
+def : Pat<(X86cmp GR32:$src1, 0),
+ (TEST32rr GR32:$src1, GR32:$src1)>;
+
+// Conditional moves with folded loads, with the operands swapped and the
+// conditions inverted.
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_B, EFLAGS),
+ (CMOVAE16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_B, EFLAGS),
+ (CMOVAE32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_AE, EFLAGS),
+ (CMOVB16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_AE, EFLAGS),
+ (CMOVB32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_E, EFLAGS),
+ (CMOVNE16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_E, EFLAGS),
+ (CMOVNE32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_NE, EFLAGS),
+ (CMOVE16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_NE, EFLAGS),
+ (CMOVE32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_BE, EFLAGS),
+ (CMOVA16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_BE, EFLAGS),
+ (CMOVA32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_A, EFLAGS),
+ (CMOVBE16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_A, EFLAGS),
+ (CMOVBE32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_L, EFLAGS),
+ (CMOVGE16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_L, EFLAGS),
+ (CMOVGE32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_GE, EFLAGS),
+ (CMOVL16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_GE, EFLAGS),
+ (CMOVL32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_LE, EFLAGS),
+ (CMOVG16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_LE, EFLAGS),
+ (CMOVG32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_G, EFLAGS),
+ (CMOVLE16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_G, EFLAGS),
+ (CMOVLE32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_P, EFLAGS),
+ (CMOVNP16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_P, EFLAGS),
+ (CMOVNP32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_NP, EFLAGS),
+ (CMOVP16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_NP, EFLAGS),
+ (CMOVP32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_S, EFLAGS),
+ (CMOVNS16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_S, EFLAGS),
+ (CMOVNS32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_NS, EFLAGS),
+ (CMOVS16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_NS, EFLAGS),
+ (CMOVS32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_O, EFLAGS),
+ (CMOVNO16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_O, EFLAGS),
+ (CMOVNO32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_NO, EFLAGS),
+ (CMOVO16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_NO, EFLAGS),
+ (CMOVO32rm GR32:$src2, addr:$src1)>;
+
+// zextload bool -> zextload byte
+def : Pat<(zextloadi8i1 addr:$src), (MOV8rm addr:$src)>;
+def : Pat<(zextloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>;
+def : Pat<(zextloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>;
+
+// extload bool -> extload byte
+def : Pat<(extloadi8i1 addr:$src), (MOV8rm addr:$src)>;
+def : Pat<(extloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>;
+def : Pat<(extloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>;
+def : Pat<(extloadi16i8 addr:$src), (MOVZX16rm8 addr:$src)>;
+def : Pat<(extloadi32i8 addr:$src), (MOVZX32rm8 addr:$src)>;
+def : Pat<(extloadi32i16 addr:$src), (MOVZX32rm16 addr:$src)>;
+
+// anyext. Define these to do an explicit zero-extend to
+// avoid partial-register updates.
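+// Writing only the low 8 or 16 bits would leave the rest of the destination
+// register live, creating a false dependency; movzx writes the full register.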
+def : Pat<(i16 (anyext GR8 :$src)), (MOVZX16rr8 GR8 :$src)>;
+def : Pat<(i32 (anyext GR8 :$src)), (MOVZX32rr8 GR8 :$src)>;
+
+// Except for i16 -> i32, since isel expects i16 ops to be promoted to i32.
+def : Pat<(i32 (anyext GR16:$src)),
+ (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR16:$src, sub_16bit)>;
+
+
+//===----------------------------------------------------------------------===//
+// Some peepholes
+//===----------------------------------------------------------------------===//
+
+// Odd encoding trick: -128 fits into an 8-bit immediate field while
+// +128 doesn't, so in this special case use a sub instead of an add.
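+// (e.g. "addl $128, %eax" needs a 32-bit immediate, while "subl $-128, %eax"
+// fits the sign-extended 8-bit immediate form.)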
+def : Pat<(add GR16:$src1, 128),
+ (SUB16ri8 GR16:$src1, -128)>;
+def : Pat<(store (add (loadi16 addr:$dst), 128), addr:$dst),
+ (SUB16mi8 addr:$dst, -128)>;
+def : Pat<(add GR32:$src1, 128),
+ (SUB32ri8 GR32:$src1, -128)>;
+def : Pat<(store (add (loadi32 addr:$dst), 128), addr:$dst),
+ (SUB32mi8 addr:$dst, -128)>;
+
+// r & (2^16-1) ==> movz
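+// (movzwl %ax, %eax is 3 bytes, versus 5 or 6 bytes for andl with a 32-bit
+// immediate.)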
+def : Pat<(and GR32:$src1, 0xffff),
+ (MOVZX32rr16 (EXTRACT_SUBREG GR32:$src1, sub_16bit))>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR32:$src1, 0xff),
+ (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src1,
+ GR32_ABCD)),
+ sub_8bit))>,
+ Requires<[In32BitMode]>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR16:$src1, 0xff),
+ (MOVZX16rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src1,
+ GR16_ABCD)),
+ sub_8bit))>,
+ Requires<[In32BitMode]>;
+
+// sext_inreg patterns
+def : Pat<(sext_inreg GR32:$src, i16),
+ (MOVSX32rr16 (EXTRACT_SUBREG GR32:$src, sub_16bit))>;
+def : Pat<(sext_inreg GR32:$src, i8),
+ (MOVSX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src,
+ GR32_ABCD)),
+ sub_8bit))>,
+ Requires<[In32BitMode]>;
+def : Pat<(sext_inreg GR16:$src, i8),
+ (MOVSX16rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src,
+ GR16_ABCD)),
+ sub_8bit))>,
+ Requires<[In32BitMode]>;
+
+// trunc patterns
+def : Pat<(i16 (trunc GR32:$src)),
+ (EXTRACT_SUBREG GR32:$src, sub_16bit)>;
+def : Pat<(i8 (trunc GR32:$src)),
+ (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
+ sub_8bit)>,
+ Requires<[In32BitMode]>;
+def : Pat<(i8 (trunc GR16:$src)),
+ (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+ sub_8bit)>,
+ Requires<[In32BitMode]>;
+
+// h-register tricks
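+// AH/BH/CH/DH alias bits 15:8 of AX/BX/CX/DX, so the high byte can be read
+// with a sub-register extract instead of a shift; the value is first copied
+// into the ABCD register classes because only those have high-byte halves.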
+def : Pat<(i8 (trunc (srl_su GR16:$src, (i8 8)))),
+ (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+ sub_8bit_hi)>,
+ Requires<[In32BitMode]>;
+def : Pat<(i8 (trunc (srl_su GR32:$src, (i8 8)))),
+ (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
+ sub_8bit_hi)>,
+ Requires<[In32BitMode]>;
+def : Pat<(srl GR16:$src, (i8 8)),
+ (EXTRACT_SUBREG
+ (MOVZX32rr8
+ (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+ sub_8bit_hi)),
+ sub_16bit)>,
+ Requires<[In32BitMode]>;
+def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))),
+ (MOVZX32rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src,
+ GR16_ABCD)),
+ sub_8bit_hi))>,
+ Requires<[In32BitMode]>;
+def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))),
+ (MOVZX32rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src,
+ GR16_ABCD)),
+ sub_8bit_hi))>,
+ Requires<[In32BitMode]>;
+def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)),
+ (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src,
+ GR32_ABCD)),
+ sub_8bit_hi))>,
+ Requires<[In32BitMode]>;
+def : Pat<(srl (and_su GR32:$src, 0xff00), (i8 8)),
+ (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src,
+ GR32_ABCD)),
+ sub_8bit_hi))>,
+ Requires<[In32BitMode]>;
+
+// (shl x, 1) ==> (add x, x)
+def : Pat<(shl GR8 :$src1, (i8 1)), (ADD8rr GR8 :$src1, GR8 :$src1)>;
+def : Pat<(shl GR16:$src1, (i8 1)), (ADD16rr GR16:$src1, GR16:$src1)>;
+def : Pat<(shl GR32:$src1, (i8 1)), (ADD32rr GR32:$src1, GR32:$src1)>;
+
+// (shl x (and y, 31)) ==> (shl x, y)
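+// The hardware masks 32-bit shift counts to the low 5 bits anyway, so the
+// explicit "and" is redundant and can be dropped.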
+def : Pat<(shl GR8:$src1, (and CL, 31)),
+ (SHL8rCL GR8:$src1)>;
+def : Pat<(shl GR16:$src1, (and CL, 31)),
+ (SHL16rCL GR16:$src1)>;
+def : Pat<(shl GR32:$src1, (and CL, 31)),
+ (SHL32rCL GR32:$src1)>;
+def : Pat<(store (shl (loadi8 addr:$dst), (and CL, 31)), addr:$dst),
+ (SHL8mCL addr:$dst)>;
+def : Pat<(store (shl (loadi16 addr:$dst), (and CL, 31)), addr:$dst),
+ (SHL16mCL addr:$dst)>;
+def : Pat<(store (shl (loadi32 addr:$dst), (and CL, 31)), addr:$dst),
+ (SHL32mCL addr:$dst)>;
+
+def : Pat<(srl GR8:$src1, (and CL, 31)),
+ (SHR8rCL GR8:$src1)>;
+def : Pat<(srl GR16:$src1, (and CL, 31)),
+ (SHR16rCL GR16:$src1)>;
+def : Pat<(srl GR32:$src1, (and CL, 31)),
+ (SHR32rCL GR32:$src1)>;
+def : Pat<(store (srl (loadi8 addr:$dst), (and CL, 31)), addr:$dst),
+ (SHR8mCL addr:$dst)>;
+def : Pat<(store (srl (loadi16 addr:$dst), (and CL, 31)), addr:$dst),
+ (SHR16mCL addr:$dst)>;
+def : Pat<(store (srl (loadi32 addr:$dst), (and CL, 31)), addr:$dst),
+ (SHR32mCL addr:$dst)>;
+
+def : Pat<(sra GR8:$src1, (and CL, 31)),
+ (SAR8rCL GR8:$src1)>;
+def : Pat<(sra GR16:$src1, (and CL, 31)),
+ (SAR16rCL GR16:$src1)>;
+def : Pat<(sra GR32:$src1, (and CL, 31)),
+ (SAR32rCL GR32:$src1)>;
+def : Pat<(store (sra (loadi8 addr:$dst), (and CL, 31)), addr:$dst),
+ (SAR8mCL addr:$dst)>;
+def : Pat<(store (sra (loadi16 addr:$dst), (and CL, 31)), addr:$dst),
+ (SAR16mCL addr:$dst)>;
+def : Pat<(store (sra (loadi32 addr:$dst), (and CL, 31)), addr:$dst),
+ (SAR32mCL addr:$dst)>;
+
+// (anyext (setcc_carry)) -> (setcc_carry)
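+// SETB_C* is materialized with "sbb reg, reg", producing 0 or -1, so selecting
+// the wider form directly makes the extension a no-op.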
+def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+ (SETB_C16r)>;
+def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+ (SETB_C32r)>;
+def : Pat<(i32 (anyext (i16 (X86setcc_c X86_COND_B, EFLAGS)))),
+ (SETB_C32r)>;
+
+// (or x1, x2) -> (add x1, x2) if two operands are known not to share bits.
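+// With no shared bits the results are identical, and add offers more folding
+// opportunities (e.g. it can be absorbed into an lea address computation).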
+let AddedComplexity = 5 in { // Try this before selecting to OR
+def : Pat<(or_is_add GR16:$src1, imm:$src2),
+ (ADD16ri GR16:$src1, imm:$src2)>;
+def : Pat<(or_is_add GR32:$src1, imm:$src2),
+ (ADD32ri GR32:$src1, imm:$src2)>;
+def : Pat<(or_is_add GR16:$src1, i16immSExt8:$src2),
+ (ADD16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(or_is_add GR32:$src1, i32immSExt8:$src2),
+ (ADD32ri8 GR32:$src1, i32immSExt8:$src2)>;
+def : Pat<(or_is_add GR16:$src1, GR16:$src2),
+ (ADD16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(or_is_add GR32:$src1, GR32:$src2),
+ (ADD32rr GR32:$src1, GR32:$src2)>;
+} // AddedComplexity
+
+//===----------------------------------------------------------------------===//
+// EFLAGS-defining Patterns
+//===----------------------------------------------------------------------===//
+
+// add reg, reg
+def : Pat<(add GR8 :$src1, GR8 :$src2), (ADD8rr GR8 :$src1, GR8 :$src2)>;
+def : Pat<(add GR16:$src1, GR16:$src2), (ADD16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(add GR32:$src1, GR32:$src2), (ADD32rr GR32:$src1, GR32:$src2)>;
+
+// add reg, mem
+def : Pat<(add GR8:$src1, (loadi8 addr:$src2)),
+ (ADD8rm GR8:$src1, addr:$src2)>;
+def : Pat<(add GR16:$src1, (loadi16 addr:$src2)),
+ (ADD16rm GR16:$src1, addr:$src2)>;
+def : Pat<(add GR32:$src1, (loadi32 addr:$src2)),
+ (ADD32rm GR32:$src1, addr:$src2)>;
+
+// add reg, imm
+def : Pat<(add GR8 :$src1, imm:$src2), (ADD8ri GR8:$src1 , imm:$src2)>;
+def : Pat<(add GR16:$src1, imm:$src2), (ADD16ri GR16:$src1, imm:$src2)>;
+def : Pat<(add GR32:$src1, imm:$src2), (ADD32ri GR32:$src1, imm:$src2)>;
+def : Pat<(add GR16:$src1, i16immSExt8:$src2),
+ (ADD16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(add GR32:$src1, i32immSExt8:$src2),
+ (ADD32ri8 GR32:$src1, i32immSExt8:$src2)>;
+
+// sub reg, reg
+def : Pat<(sub GR8 :$src1, GR8 :$src2), (SUB8rr GR8 :$src1, GR8 :$src2)>;
+def : Pat<(sub GR16:$src1, GR16:$src2), (SUB16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(sub GR32:$src1, GR32:$src2), (SUB32rr GR32:$src1, GR32:$src2)>;
+
+// sub reg, mem
+def : Pat<(sub GR8:$src1, (loadi8 addr:$src2)),
+ (SUB8rm GR8:$src1, addr:$src2)>;
+def : Pat<(sub GR16:$src1, (loadi16 addr:$src2)),
+ (SUB16rm GR16:$src1, addr:$src2)>;
+def : Pat<(sub GR32:$src1, (loadi32 addr:$src2)),
+ (SUB32rm GR32:$src1, addr:$src2)>;
+
+// sub reg, imm
+def : Pat<(sub GR8:$src1, imm:$src2),
+ (SUB8ri GR8:$src1, imm:$src2)>;
+def : Pat<(sub GR16:$src1, imm:$src2),
+ (SUB16ri GR16:$src1, imm:$src2)>;
+def : Pat<(sub GR32:$src1, imm:$src2),
+ (SUB32ri GR32:$src1, imm:$src2)>;
+def : Pat<(sub GR16:$src1, i16immSExt8:$src2),
+ (SUB16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(sub GR32:$src1, i32immSExt8:$src2),
+ (SUB32ri8 GR32:$src1, i32immSExt8:$src2)>;
+
+// mul reg, reg
+def : Pat<(mul GR16:$src1, GR16:$src2),
+ (IMUL16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(mul GR32:$src1, GR32:$src2),
+ (IMUL32rr GR32:$src1, GR32:$src2)>;
+
+// mul reg, mem
+def : Pat<(mul GR16:$src1, (loadi16 addr:$src2)),
+ (IMUL16rm GR16:$src1, addr:$src2)>;
+def : Pat<(mul GR32:$src1, (loadi32 addr:$src2)),
+ (IMUL32rm GR32:$src1, addr:$src2)>;
+
+// mul reg, imm
+def : Pat<(mul GR16:$src1, imm:$src2),
+ (IMUL16rri GR16:$src1, imm:$src2)>;
+def : Pat<(mul GR32:$src1, imm:$src2),
+ (IMUL32rri GR32:$src1, imm:$src2)>;
+def : Pat<(mul GR16:$src1, i16immSExt8:$src2),
+ (IMUL16rri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(mul GR32:$src1, i32immSExt8:$src2),
+ (IMUL32rri8 GR32:$src1, i32immSExt8:$src2)>;
+
+// reg = mul mem, imm
+def : Pat<(mul (loadi16 addr:$src1), imm:$src2),
+ (IMUL16rmi addr:$src1, imm:$src2)>;
+def : Pat<(mul (loadi32 addr:$src1), imm:$src2),
+ (IMUL32rmi addr:$src1, imm:$src2)>;
+def : Pat<(mul (loadi16 addr:$src1), i16immSExt8:$src2),
+ (IMUL16rmi8 addr:$src1, i16immSExt8:$src2)>;
+def : Pat<(mul (loadi32 addr:$src1), i32immSExt8:$src2),
+ (IMUL32rmi8 addr:$src1, i32immSExt8:$src2)>;
+
+// Optimize multiply by 2 with EFLAGS result.
+let AddedComplexity = 2 in {
+def : Pat<(X86smul_flag GR16:$src1, 2), (ADD16rr GR16:$src1, GR16:$src1)>;
+def : Pat<(X86smul_flag GR32:$src1, 2), (ADD32rr GR32:$src1, GR32:$src1)>;
+}
+
+// Patterns for nodes that do not produce flags, for instructions that do.
+
+// Increment reg.
+def : Pat<(add GR8:$src , 1), (INC8r GR8:$src)>;
+def : Pat<(add GR16:$src, 1), (INC16r GR16:$src)>, Requires<[In32BitMode]>;
+def : Pat<(add GR32:$src, 1), (INC32r GR32:$src)>, Requires<[In32BitMode]>;
+
+// Decrement reg.
+def : Pat<(add GR8:$src , -1), (DEC8r GR8:$src)>;
+def : Pat<(add GR16:$src, -1), (DEC16r GR16:$src)>, Requires<[In32BitMode]>;
+def : Pat<(add GR32:$src, -1), (DEC32r GR32:$src)>, Requires<[In32BitMode]>;
+
+// or reg/reg.
+def : Pat<(or GR8 :$src1, GR8 :$src2), (OR8rr GR8 :$src1, GR8 :$src2)>;
+def : Pat<(or GR16:$src1, GR16:$src2), (OR16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(or GR32:$src1, GR32:$src2), (OR32rr GR32:$src1, GR32:$src2)>;
+
+// or reg/mem
+def : Pat<(or GR8:$src1, (loadi8 addr:$src2)),
+ (OR8rm GR8:$src1, addr:$src2)>;
+def : Pat<(or GR16:$src1, (loadi16 addr:$src2)),
+ (OR16rm GR16:$src1, addr:$src2)>;
+def : Pat<(or GR32:$src1, (loadi32 addr:$src2)),
+ (OR32rm GR32:$src1, addr:$src2)>;
+
+// or reg/imm
+def : Pat<(or GR8:$src1 , imm:$src2), (OR8ri GR8 :$src1, imm:$src2)>;
+def : Pat<(or GR16:$src1, imm:$src2), (OR16ri GR16:$src1, imm:$src2)>;
+def : Pat<(or GR32:$src1, imm:$src2), (OR32ri GR32:$src1, imm:$src2)>;
+def : Pat<(or GR16:$src1, i16immSExt8:$src2),
+ (OR16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(or GR32:$src1, i32immSExt8:$src2),
+ (OR32ri8 GR32:$src1, i32immSExt8:$src2)>;
+
+// xor reg/reg
+def : Pat<(xor GR8 :$src1, GR8 :$src2), (XOR8rr GR8 :$src1, GR8 :$src2)>;
+def : Pat<(xor GR16:$src1, GR16:$src2), (XOR16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(xor GR32:$src1, GR32:$src2), (XOR32rr GR32:$src1, GR32:$src2)>;
+
+// xor reg/mem
+def : Pat<(xor GR8:$src1, (loadi8 addr:$src2)),
+ (XOR8rm GR8:$src1, addr:$src2)>;
+def : Pat<(xor GR16:$src1, (loadi16 addr:$src2)),
+ (XOR16rm GR16:$src1, addr:$src2)>;
+def : Pat<(xor GR32:$src1, (loadi32 addr:$src2)),
+ (XOR32rm GR32:$src1, addr:$src2)>;
+
+// xor reg/imm
+def : Pat<(xor GR8:$src1, imm:$src2),
+ (XOR8ri GR8:$src1, imm:$src2)>;
+def : Pat<(xor GR16:$src1, imm:$src2),
+ (XOR16ri GR16:$src1, imm:$src2)>;
+def : Pat<(xor GR32:$src1, imm:$src2),
+ (XOR32ri GR32:$src1, imm:$src2)>;
+def : Pat<(xor GR16:$src1, i16immSExt8:$src2),
+ (XOR16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(xor GR32:$src1, i32immSExt8:$src2),
+ (XOR32ri8 GR32:$src1, i32immSExt8:$src2)>;
+
+// and reg/reg
+def : Pat<(and GR8 :$src1, GR8 :$src2), (AND8rr GR8 :$src1, GR8 :$src2)>;
+def : Pat<(and GR16:$src1, GR16:$src2), (AND16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(and GR32:$src1, GR32:$src2), (AND32rr GR32:$src1, GR32:$src2)>;
+
+// and reg/mem
+def : Pat<(and GR8:$src1, (loadi8 addr:$src2)),
+ (AND8rm GR8:$src1, addr:$src2)>;
+def : Pat<(and GR16:$src1, (loadi16 addr:$src2)),
+ (AND16rm GR16:$src1, addr:$src2)>;
+def : Pat<(and GR32:$src1, (loadi32 addr:$src2)),
+ (AND32rm GR32:$src1, addr:$src2)>;
+
+// and reg/imm
+def : Pat<(and GR8:$src1, imm:$src2),
+ (AND8ri GR8:$src1, imm:$src2)>;
+def : Pat<(and GR16:$src1, imm:$src2),
+ (AND16ri GR16:$src1, imm:$src2)>;
+def : Pat<(and GR32:$src1, imm:$src2),
+ (AND32ri GR32:$src1, imm:$src2)>;
+def : Pat<(and GR16:$src1, i16immSExt8:$src2),
+ (AND16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(and GR32:$src1, i32immSExt8:$src2),
+ (AND32ri8 GR32:$src1, i32immSExt8:$src2)>;
+
+//===----------------------------------------------------------------------===//
+// Floating Point Stack Support
+//===----------------------------------------------------------------------===//
+
+include "X86InstrFPStack.td"
+
+//===----------------------------------------------------------------------===//
+// X86-64 Support
+//===----------------------------------------------------------------------===//
+
+include "X86Instr64bit.td"
+
+//===----------------------------------------------------------------------===//
+// SIMD support (SSE, MMX and AVX)
+//===----------------------------------------------------------------------===//
+
+include "X86InstrFragmentsSIMD.td"
+
+//===----------------------------------------------------------------------===//
+// XMM Floating point support (requires SSE / SSE2)
+//===----------------------------------------------------------------------===//
+
+include "X86InstrSSE.td"
+
+//===----------------------------------------------------------------------===//
+// MMX and XMM Packed Integer support (requires MMX, SSE, and SSE2)
+//===----------------------------------------------------------------------===//
+
+include "X86InstrMMX.td"
diff --git a/contrib/llvm/lib/Target/X86/X86InstrMMX.td b/contrib/llvm/lib/Target/X86/X86InstrMMX.td
new file mode 100644
index 0000000..0952fc8
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrMMX.td
@@ -0,0 +1,677 @@
+//====- X86InstrMMX.td - Describe the X86 Instruction Set --*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 MMX instruction set, defining the instructions,
+// and properties of the instructions which are needed for code generation,
+// machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MMX Multiclasses
+//===----------------------------------------------------------------------===//
+
+let Constraints = "$src1 = $dst" in {
+ // MMXI_binop_rm - Simple MMX binary operator.
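+ // Each instance expands to a register-register form (rr) and a form that
+ // folds a 64-bit memory load of the second operand (rm).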
+ multiclass MMXI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType OpVT, bit Commutable = 0> {
+ def rr : MMXI<opc, MRMSrcReg, (outs VR64:$dst),
+ (ins VR64:$src1, VR64:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (OpVT (OpNode VR64:$src1, VR64:$src2)))]> {
+ let isCommutable = Commutable;
+ }
+ def rm : MMXI<opc, MRMSrcMem, (outs VR64:$dst),
+ (ins VR64:$src1, i64mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (OpVT (OpNode VR64:$src1,
+ (bitconvert
+ (load_mmx addr:$src2)))))]>;
+ }
+
+ multiclass MMXI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
+ bit Commutable = 0> {
+ def rr : MMXI<opc, MRMSrcReg, (outs VR64:$dst),
+ (ins VR64:$src1, VR64:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))]> {
+ let isCommutable = Commutable;
+ }
+ def rm : MMXI<opc, MRMSrcMem, (outs VR64:$dst),
+ (ins VR64:$src1, i64mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId VR64:$src1,
+ (bitconvert (load_mmx addr:$src2))))]>;
+ }
+
+ // MMXI_binop_rm_v1i64 - Simple MMX binary operator whose type is v1i64.
+ //
+ // FIXME: we could eliminate this and use MMXI_binop_rm instead if tblgen knew
+ // to collapse (bitconvert VT to VT) into its operand.
+ //
+ multiclass MMXI_binop_rm_v1i64<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ bit Commutable = 0> {
+ def rr : MMXI<opc, MRMSrcReg, (outs VR64:$dst),
+ (ins VR64:$src1, VR64:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (v1i64 (OpNode VR64:$src1, VR64:$src2)))]> {
+ let isCommutable = Commutable;
+ }
+ def rm : MMXI<opc, MRMSrcMem, (outs VR64:$dst),
+ (ins VR64:$src1, i64mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst,
+ (OpNode VR64:$src1,(load_mmx addr:$src2)))]>;
+ }
+
+ multiclass MMXI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm,
+ string OpcodeStr, Intrinsic IntId,
+ Intrinsic IntId2> {
+ def rr : MMXI<opc, MRMSrcReg, (outs VR64:$dst),
+ (ins VR64:$src1, VR64:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))]>;
+ def rm : MMXI<opc, MRMSrcMem, (outs VR64:$dst),
+ (ins VR64:$src1, i64mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId VR64:$src1,
+ (bitconvert (load_mmx addr:$src2))))]>;
+ def ri : MMXIi8<opc2, ImmForm, (outs VR64:$dst),
+ (ins VR64:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId2 VR64:$src1, (i32 imm:$src2)))]>;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// MMX EMMS & FEMMS Instructions
+//===----------------------------------------------------------------------===//
+
+def MMX_EMMS : MMXI<0x77, RawFrm, (outs), (ins), "emms",
+ [(int_x86_mmx_emms)]>;
+def MMX_FEMMS : MMXI<0x0E, RawFrm, (outs), (ins), "femms",
+ [(int_x86_mmx_femms)]>;
+
+//===----------------------------------------------------------------------===//
+// MMX Scalar Instructions
+//===----------------------------------------------------------------------===//
+
+// Data Transfer Instructions
+def MMX_MOVD64rr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst,
+ (v2i32 (scalar_to_vector GR32:$src)))]>;
+let canFoldAsLoad = 1, isReMaterializable = 1 in
+def MMX_MOVD64rm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), (ins i32mem:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst,
+ (v2i32 (scalar_to_vector (loadi32 addr:$src))))]>;
+let mayStore = 1 in
+def MMX_MOVD64mr : MMXI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR64:$src),
+ "movd\t{$src, $dst|$dst, $src}", []>;
+def MMX_MOVD64grr : MMXI<0x7E, MRMDestReg, (outs), (ins GR32:$dst, VR64:$src),
+ "movd\t{$src, $dst|$dst, $src}", []>;
+
+let neverHasSideEffects = 1 in
+def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ []>;
+
+let neverHasSideEffects = 1 in
+// These are 64-bit moves, but since the OS X assembler doesn't
+// recognize a register-register movq, we write them as movd.
+def MMX_MOVD64from64rr : MMXRI<0x7E, MRMDestReg,
+ (outs GR64:$dst), (ins VR64:$src),
+ "movd\t{$src, $dst|$dst, $src}", []>;
+def MMX_MOVD64rrv164 : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst,
+ (v1i64 (scalar_to_vector GR64:$src)))]>;
+
+let neverHasSideEffects = 1 in
+def MMX_MOVQ64rr : MMXI<0x6F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
+ "movq\t{$src, $dst|$dst, $src}", []>;
+let canFoldAsLoad = 1, isReMaterializable = 1 in
+def MMX_MOVQ64rm : MMXI<0x6F, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst, (load_mmx addr:$src))]>;
+def MMX_MOVQ64mr : MMXI<0x7F, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(store (v1i64 VR64:$src), addr:$dst)]>;
+
+def MMX_MOVDQ2Qrr : SDIi8<0xD6, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
+ "movdq2q\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst,
+ (v1i64 (bitconvert
+ (i64 (vector_extract (v2i64 VR128:$src),
+ (iPTR 0))))))]>;
+
+def MMX_MOVQ2DQrr : SSDIi8<0xD6, MRMSrcReg, (outs VR128:$dst), (ins VR64:$src),
+ "movq2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (movl immAllZerosV,
+ (v2i64 (scalar_to_vector
+ (i64 (bitconvert (v1i64 VR64:$src)))))))]>;
+
+let neverHasSideEffects = 1 in
+def MMX_MOVQ2FR64rr: SSDIi8<0xD6, MRMSrcReg, (outs FR64:$dst), (ins VR64:$src),
+ "movq2dq\t{$src, $dst|$dst, $src}", []>;
+
+def MMX_MOVFR642Qrr: SSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst), (ins FR64:$src),
+ "movdq2q\t{$src, $dst|$dst, $src}", []>;
+
+def MMX_MOVNTQmr : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
+ "movntq\t{$src, $dst|$dst, $src}",
+ [(int_x86_mmx_movnt_dq addr:$dst, VR64:$src)]>;
+
+let AddedComplexity = 15 in
+// movd to MMX register zero-extends
+def MMX_MOVZDI2PDIrr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst,
+ (v2i32 (X86vzmovl (v2i32 (scalar_to_vector GR32:$src)))))]>;
+let AddedComplexity = 20 in
+def MMX_MOVZDI2PDIrm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst),
+ (ins i32mem:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst,
+ (v2i32 (X86vzmovl (v2i32
+ (scalar_to_vector (loadi32 addr:$src))))))]>;
+
+// Arithmetic Instructions
+
+// -- Addition
+defm MMX_PADDB : MMXI_binop_rm<0xFC, "paddb", add, v8i8, 1>;
+defm MMX_PADDW : MMXI_binop_rm<0xFD, "paddw", add, v4i16, 1>;
+defm MMX_PADDD : MMXI_binop_rm<0xFE, "paddd", add, v2i32, 1>;
+defm MMX_PADDQ : MMXI_binop_rm<0xD4, "paddq", add, v1i64, 1>;
+
+defm MMX_PADDSB : MMXI_binop_rm_int<0xEC, "paddsb" , int_x86_mmx_padds_b, 1>;
+defm MMX_PADDSW : MMXI_binop_rm_int<0xED, "paddsw" , int_x86_mmx_padds_w, 1>;
+
+defm MMX_PADDUSB : MMXI_binop_rm_int<0xDC, "paddusb", int_x86_mmx_paddus_b, 1>;
+defm MMX_PADDUSW : MMXI_binop_rm_int<0xDD, "paddusw", int_x86_mmx_paddus_w, 1>;
+
+// -- Subtraction
+defm MMX_PSUBB : MMXI_binop_rm<0xF8, "psubb", sub, v8i8>;
+defm MMX_PSUBW : MMXI_binop_rm<0xF9, "psubw", sub, v4i16>;
+defm MMX_PSUBD : MMXI_binop_rm<0xFA, "psubd", sub, v2i32>;
+defm MMX_PSUBQ : MMXI_binop_rm<0xFB, "psubq", sub, v1i64>;
+
+defm MMX_PSUBSB : MMXI_binop_rm_int<0xE8, "psubsb" , int_x86_mmx_psubs_b>;
+defm MMX_PSUBSW : MMXI_binop_rm_int<0xE9, "psubsw" , int_x86_mmx_psubs_w>;
+
+defm MMX_PSUBUSB : MMXI_binop_rm_int<0xD8, "psubusb", int_x86_mmx_psubus_b>;
+defm MMX_PSUBUSW : MMXI_binop_rm_int<0xD9, "psubusw", int_x86_mmx_psubus_w>;
+
+// -- Multiplication
+defm MMX_PMULLW : MMXI_binop_rm<0xD5, "pmullw", mul, v4i16, 1>;
+
+defm MMX_PMULHW : MMXI_binop_rm_int<0xE5, "pmulhw", int_x86_mmx_pmulh_w, 1>;
+defm MMX_PMULHUW : MMXI_binop_rm_int<0xE4, "pmulhuw", int_x86_mmx_pmulhu_w, 1>;
+defm MMX_PMULUDQ : MMXI_binop_rm_int<0xF4, "pmuludq", int_x86_mmx_pmulu_dq, 1>;
+
+// -- Miscellanea
+defm MMX_PMADDWD : MMXI_binop_rm_int<0xF5, "pmaddwd", int_x86_mmx_pmadd_wd, 1>;
+
+defm MMX_PAVGB : MMXI_binop_rm_int<0xE0, "pavgb", int_x86_mmx_pavg_b, 1>;
+defm MMX_PAVGW : MMXI_binop_rm_int<0xE3, "pavgw", int_x86_mmx_pavg_w, 1>;
+
+defm MMX_PMINUB : MMXI_binop_rm_int<0xDA, "pminub", int_x86_mmx_pminu_b, 1>;
+defm MMX_PMINSW : MMXI_binop_rm_int<0xEA, "pminsw", int_x86_mmx_pmins_w, 1>;
+
+defm MMX_PMAXUB : MMXI_binop_rm_int<0xDE, "pmaxub", int_x86_mmx_pmaxu_b, 1>;
+defm MMX_PMAXSW : MMXI_binop_rm_int<0xEE, "pmaxsw", int_x86_mmx_pmaxs_w, 1>;
+
+defm MMX_PSADBW : MMXI_binop_rm_int<0xF6, "psadbw", int_x86_mmx_psad_bw, 1>;
+
+// Logical Instructions
+defm MMX_PAND : MMXI_binop_rm_v1i64<0xDB, "pand", and, 1>;
+defm MMX_POR : MMXI_binop_rm_v1i64<0xEB, "por" , or, 1>;
+defm MMX_PXOR : MMXI_binop_rm_v1i64<0xEF, "pxor", xor, 1>;
+
+let Constraints = "$src1 = $dst" in {
+ def MMX_PANDNrr : MMXI<0xDF, MRMSrcReg,
+ (outs VR64:$dst), (ins VR64:$src1, VR64:$src2),
+ "pandn\t{$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst, (v1i64 (and (vnot VR64:$src1),
+ VR64:$src2)))]>;
+ def MMX_PANDNrm : MMXI<0xDF, MRMSrcMem,
+ (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2),
+ "pandn\t{$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst, (v1i64 (and (vnot VR64:$src1),
+ (load addr:$src2))))]>;
+}
+
+// Shift Instructions
+defm MMX_PSRLW : MMXI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw",
+ int_x86_mmx_psrl_w, int_x86_mmx_psrli_w>;
+defm MMX_PSRLD : MMXI_binop_rmi_int<0xD2, 0x72, MRM2r, "psrld",
+ int_x86_mmx_psrl_d, int_x86_mmx_psrli_d>;
+defm MMX_PSRLQ : MMXI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq",
+ int_x86_mmx_psrl_q, int_x86_mmx_psrli_q>;
+
+defm MMX_PSLLW : MMXI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw",
+ int_x86_mmx_psll_w, int_x86_mmx_pslli_w>;
+defm MMX_PSLLD : MMXI_binop_rmi_int<0xF2, 0x72, MRM6r, "pslld",
+ int_x86_mmx_psll_d, int_x86_mmx_pslli_d>;
+defm MMX_PSLLQ : MMXI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq",
+ int_x86_mmx_psll_q, int_x86_mmx_pslli_q>;
+
+defm MMX_PSRAW : MMXI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw",
+ int_x86_mmx_psra_w, int_x86_mmx_psrai_w>;
+defm MMX_PSRAD : MMXI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad",
+ int_x86_mmx_psra_d, int_x86_mmx_psrai_d>;
+
+// Shift up / down and insert zeros.
+def : Pat<(v1i64 (X86vshl VR64:$src, (i8 imm:$amt))),
+ (MMX_PSLLQri VR64:$src, (GetLo32XForm imm:$amt))>;
+def : Pat<(v1i64 (X86vshr VR64:$src, (i8 imm:$amt))),
+ (MMX_PSRLQri VR64:$src, (GetLo32XForm imm:$amt))>;
+
+// Comparison Instructions
+defm MMX_PCMPEQB : MMXI_binop_rm_int<0x74, "pcmpeqb", int_x86_mmx_pcmpeq_b>;
+defm MMX_PCMPEQW : MMXI_binop_rm_int<0x75, "pcmpeqw", int_x86_mmx_pcmpeq_w>;
+defm MMX_PCMPEQD : MMXI_binop_rm_int<0x76, "pcmpeqd", int_x86_mmx_pcmpeq_d>;
+
+defm MMX_PCMPGTB : MMXI_binop_rm_int<0x64, "pcmpgtb", int_x86_mmx_pcmpgt_b>;
+defm MMX_PCMPGTW : MMXI_binop_rm_int<0x65, "pcmpgtw", int_x86_mmx_pcmpgt_w>;
+defm MMX_PCMPGTD : MMXI_binop_rm_int<0x66, "pcmpgtd", int_x86_mmx_pcmpgt_d>;
+
+// Conversion Instructions
+
+// -- Unpack Instructions
+let Constraints = "$src1 = $dst" in {
+ // Unpack High Packed Data Instructions
+ def MMX_PUNPCKHBWrr : MMXI<0x68, MRMSrcReg,
+ (outs VR64:$dst), (ins VR64:$src1, VR64:$src2),
+ "punpckhbw\t{$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst,
+ (v8i8 (mmx_unpckh VR64:$src1, VR64:$src2)))]>;
+ def MMX_PUNPCKHBWrm : MMXI<0x68, MRMSrcMem,
+ (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2),
+ "punpckhbw\t{$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst,
+ (v8i8 (mmx_unpckh VR64:$src1,
+ (bc_v8i8 (load_mmx addr:$src2)))))]>;
+
+ def MMX_PUNPCKHWDrr : MMXI<0x69, MRMSrcReg,
+ (outs VR64:$dst), (ins VR64:$src1, VR64:$src2),
+ "punpckhwd\t{$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst,
+ (v4i16 (mmx_unpckh VR64:$src1, VR64:$src2)))]>;
+ def MMX_PUNPCKHWDrm : MMXI<0x69, MRMSrcMem,
+ (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2),
+ "punpckhwd\t{$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst,
+ (v4i16 (mmx_unpckh VR64:$src1,
+ (bc_v4i16 (load_mmx addr:$src2)))))]>;
+
+ def MMX_PUNPCKHDQrr : MMXI<0x6A, MRMSrcReg,
+ (outs VR64:$dst), (ins VR64:$src1, VR64:$src2),
+ "punpckhdq\t{$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst,
+ (v2i32 (mmx_unpckh VR64:$src1, VR64:$src2)))]>;
+ def MMX_PUNPCKHDQrm : MMXI<0x6A, MRMSrcMem,
+ (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2),
+ "punpckhdq\t{$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst,
+ (v2i32 (mmx_unpckh VR64:$src1,
+ (bc_v2i32 (load_mmx addr:$src2)))))]>;
+
+ // Unpack Low Packed Data Instructions
+ def MMX_PUNPCKLBWrr : MMXI<0x60, MRMSrcReg,
+ (outs VR64:$dst), (ins VR64:$src1, VR64:$src2),
+ "punpcklbw\t{$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst,
+ (v8i8 (mmx_unpckl VR64:$src1, VR64:$src2)))]>;
+ def MMX_PUNPCKLBWrm : MMXI<0x60, MRMSrcMem,
+ (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2),
+ "punpcklbw\t{$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst,
+ (v8i8 (mmx_unpckl VR64:$src1,
+ (bc_v8i8 (load_mmx addr:$src2)))))]>;
+
+ def MMX_PUNPCKLWDrr : MMXI<0x61, MRMSrcReg,
+ (outs VR64:$dst), (ins VR64:$src1, VR64:$src2),
+ "punpcklwd\t{$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst,
+ (v4i16 (mmx_unpckl VR64:$src1, VR64:$src2)))]>;
+ def MMX_PUNPCKLWDrm : MMXI<0x61, MRMSrcMem,
+ (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2),
+ "punpcklwd\t{$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst,
+ (v4i16 (mmx_unpckl VR64:$src1,
+ (bc_v4i16 (load_mmx addr:$src2)))))]>;
+
+ def MMX_PUNPCKLDQrr : MMXI<0x62, MRMSrcReg,
+ (outs VR64:$dst), (ins VR64:$src1, VR64:$src2),
+ "punpckldq\t{$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst,
+ (v2i32 (mmx_unpckl VR64:$src1, VR64:$src2)))]>;
+ def MMX_PUNPCKLDQrm : MMXI<0x62, MRMSrcMem,
+ (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2),
+ "punpckldq\t{$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst,
+ (v2i32 (mmx_unpckl VR64:$src1,
+ (bc_v2i32 (load_mmx addr:$src2)))))]>;
+}
+
+// -- Pack Instructions
+defm MMX_PACKSSWB : MMXI_binop_rm_int<0x63, "packsswb", int_x86_mmx_packsswb>;
+defm MMX_PACKSSDW : MMXI_binop_rm_int<0x6B, "packssdw", int_x86_mmx_packssdw>;
+defm MMX_PACKUSWB : MMXI_binop_rm_int<0x67, "packuswb", int_x86_mmx_packuswb>;
+
+// -- Shuffle Instructions
+def MMX_PSHUFWri : MMXIi8<0x70, MRMSrcReg,
+ (outs VR64:$dst), (ins VR64:$src1, i8imm:$src2),
+ "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR64:$dst,
+ (v4i16 (mmx_pshufw:$src2 VR64:$src1, (undef))))]>;
+def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem,
+ (outs VR64:$dst), (ins i64mem:$src1, i8imm:$src2),
+ "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR64:$dst,
+ (mmx_pshufw:$src2 (bc_v4i16 (load_mmx addr:$src1)),
+ (undef)))]>;
+
+// -- Conversion Instructions
+let neverHasSideEffects = 1 in {
+def MMX_CVTPD2PIrr : MMX2I<0x2D, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
+ "cvtpd2pi\t{$src, $dst|$dst, $src}", []>;
+let mayLoad = 1 in
+def MMX_CVTPD2PIrm : MMX2I<0x2D, MRMSrcMem, (outs VR64:$dst),
+ (ins f128mem:$src),
+ "cvtpd2pi\t{$src, $dst|$dst, $src}", []>;
+
+def MMX_CVTPI2PDrr : MMX2I<0x2A, MRMSrcReg, (outs VR128:$dst), (ins VR64:$src),
+ "cvtpi2pd\t{$src, $dst|$dst, $src}", []>;
+let mayLoad = 1 in
+def MMX_CVTPI2PDrm : MMX2I<0x2A, MRMSrcMem, (outs VR128:$dst),
+ (ins i64mem:$src),
+ "cvtpi2pd\t{$src, $dst|$dst, $src}", []>;
+
+def MMX_CVTPI2PSrr : MMXI<0x2A, MRMSrcReg, (outs VR128:$dst), (ins VR64:$src),
+ "cvtpi2ps\t{$src, $dst|$dst, $src}", []>;
+let mayLoad = 1 in
+def MMX_CVTPI2PSrm : MMXI<0x2A, MRMSrcMem, (outs VR128:$dst),
+ (ins i64mem:$src),
+ "cvtpi2ps\t{$src, $dst|$dst, $src}", []>;
+
+def MMX_CVTPS2PIrr : MMXI<0x2D, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
+ "cvtps2pi\t{$src, $dst|$dst, $src}", []>;
+let mayLoad = 1 in
+def MMX_CVTPS2PIrm : MMXI<0x2D, MRMSrcMem, (outs VR64:$dst), (ins f64mem:$src),
+ "cvtps2pi\t{$src, $dst|$dst, $src}", []>;
+
+def MMX_CVTTPD2PIrr : MMX2I<0x2C, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
+ "cvttpd2pi\t{$src, $dst|$dst, $src}", []>;
+let mayLoad = 1 in
+def MMX_CVTTPD2PIrm : MMX2I<0x2C, MRMSrcMem, (outs VR64:$dst),
+ (ins f128mem:$src),
+ "cvttpd2pi\t{$src, $dst|$dst, $src}", []>;
+
+def MMX_CVTTPS2PIrr : MMXI<0x2C, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
+ "cvttps2pi\t{$src, $dst|$dst, $src}", []>;
+let mayLoad = 1 in
+def MMX_CVTTPS2PIrm : MMXI<0x2C, MRMSrcMem, (outs VR64:$dst), (ins f64mem:$src),
+ "cvttps2pi\t{$src, $dst|$dst, $src}", []>;
+} // end neverHasSideEffects
+
+
+// Extract / Insert
+def MMX_X86pinsrw : SDNode<"X86ISD::MMX_PINSRW",
+ SDTypeProfile<1, 3, [SDTCisVT<0, v4i16>, SDTCisSameAs<0,1>,
+ SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>;
+
+
+def MMX_PEXTRWri : MMXIi8<0xC5, MRMSrcReg,
+ (outs GR32:$dst), (ins VR64:$src1, i16i8imm:$src2),
+ "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32:$dst, (X86pextrw (v4i16 VR64:$src1),
+ (iPTR imm:$src2)))]>;
+let Constraints = "$src1 = $dst" in {
+ def MMX_PINSRWrri : MMXIi8<0xC4, MRMSrcReg,
+ (outs VR64:$dst),
+ (ins VR64:$src1, GR32:$src2,i16i8imm:$src3),
+ "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR64:$dst, (v4i16 (MMX_X86pinsrw (v4i16 VR64:$src1),
+ GR32:$src2,(iPTR imm:$src3))))]>;
+ def MMX_PINSRWrmi : MMXIi8<0xC4, MRMSrcMem,
+ (outs VR64:$dst),
+ (ins VR64:$src1, i16mem:$src2, i16i8imm:$src3),
+ "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR64:$dst,
+ (v4i16 (MMX_X86pinsrw (v4i16 VR64:$src1),
+ (i32 (anyext (loadi16 addr:$src2))),
+ (iPTR imm:$src3))))]>;
+}
+
+// MMX to XMM for vector types
+def MMX_X86movq2dq : SDNode<"X86ISD::MOVQ2DQ", SDTypeProfile<1, 1,
+ [SDTCisVT<0, v2i64>, SDTCisVT<1, v1i64>]>>;
+
+def : Pat<(v2i64 (MMX_X86movq2dq VR64:$src)),
+ (v2i64 (MMX_MOVQ2DQrr VR64:$src))>;
+
+def : Pat<(v2i64 (MMX_X86movq2dq (load_mmx addr:$src))),
+ (v2i64 (MOVQI2PQIrm addr:$src))>;
+
+def : Pat<(v2i64 (MMX_X86movq2dq (v1i64 (bitconvert
+ (v2i32 (scalar_to_vector (loadi32 addr:$src))))))),
+ (v2i64 (MOVDI2PDIrm addr:$src))>;
+
+// Mask creation
+def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR64:$src),
+ "pmovmskb\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (int_x86_mmx_pmovmskb VR64:$src))]>;
+
+// Misc.
+let Uses = [EDI] in
+def MMX_MASKMOVQ : MMXI<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask),
+ "maskmovq\t{$mask, $src|$src, $mask}",
+ [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, EDI)]>;
+let Uses = [RDI] in
+def MMX_MASKMOVQ64: MMXI64<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask),
+ "maskmovq\t{$mask, $src|$src, $mask}",
+ [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, RDI)]>;
+
+//===----------------------------------------------------------------------===//
+// Alias Instructions
+//===----------------------------------------------------------------------===//
+
+// Alias instructions that map zero vector to pxor.
+let isReMaterializable = 1, isCodeGenOnly = 1 in {
+ // FIXME: Change encoding to pseudo.
+ def MMX_V_SET0 : MMXI<0xEF, MRMInitReg, (outs VR64:$dst), (ins), "",
+ [(set VR64:$dst, (v2i32 immAllZerosV))]>;
+ def MMX_V_SETALLONES : MMXI<0x76, MRMInitReg, (outs VR64:$dst), (ins), "",
+ [(set VR64:$dst, (v2i32 immAllOnesV))]>;
+}
+
+let Predicates = [HasMMX] in {
+ def : Pat<(v1i64 immAllZerosV), (MMX_V_SET0)>;
+ def : Pat<(v4i16 immAllZerosV), (MMX_V_SET0)>;
+ def : Pat<(v8i8 immAllZerosV), (MMX_V_SET0)>;
+}
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// Store 64-bit integer vector values.
+def : Pat<(store (v8i8 VR64:$src), addr:$dst),
+ (MMX_MOVQ64mr addr:$dst, VR64:$src)>;
+def : Pat<(store (v4i16 VR64:$src), addr:$dst),
+ (MMX_MOVQ64mr addr:$dst, VR64:$src)>;
+def : Pat<(store (v2i32 VR64:$src), addr:$dst),
+ (MMX_MOVQ64mr addr:$dst, VR64:$src)>;
+def : Pat<(store (v2f32 VR64:$src), addr:$dst),
+ (MMX_MOVQ64mr addr:$dst, VR64:$src)>;
+def : Pat<(store (v1i64 VR64:$src), addr:$dst),
+ (MMX_MOVQ64mr addr:$dst, VR64:$src)>;
+
+// Bit convert.
+def : Pat<(v8i8 (bitconvert (v1i64 VR64:$src))), (v8i8 VR64:$src)>;
+def : Pat<(v8i8 (bitconvert (v2i32 VR64:$src))), (v8i8 VR64:$src)>;
+def : Pat<(v8i8 (bitconvert (v2f32 VR64:$src))), (v8i8 VR64:$src)>;
+def : Pat<(v8i8 (bitconvert (v4i16 VR64:$src))), (v8i8 VR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v1i64 VR64:$src))), (v4i16 VR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v2i32 VR64:$src))), (v4i16 VR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v2f32 VR64:$src))), (v4i16 VR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v8i8 VR64:$src))), (v4i16 VR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v1i64 VR64:$src))), (v2i32 VR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v2f32 VR64:$src))), (v2i32 VR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v4i16 VR64:$src))), (v2i32 VR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v8i8 VR64:$src))), (v2i32 VR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v1i64 VR64:$src))), (v2f32 VR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v2i32 VR64:$src))), (v2f32 VR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v4i16 VR64:$src))), (v2f32 VR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v8i8 VR64:$src))), (v2f32 VR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v2i32 VR64:$src))), (v1i64 VR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v2f32 VR64:$src))), (v1i64 VR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v4i16 VR64:$src))), (v1i64 VR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v8i8 VR64:$src))), (v1i64 VR64:$src)>;
+
+// 64-bit bit convert.
+def : Pat<(v1i64 (bitconvert (i64 GR64:$src))),
+ (MMX_MOVD64to64rr GR64:$src)>;
+def : Pat<(v2i32 (bitconvert (i64 GR64:$src))),
+ (MMX_MOVD64to64rr GR64:$src)>;
+def : Pat<(v2f32 (bitconvert (i64 GR64:$src))),
+ (MMX_MOVD64to64rr GR64:$src)>;
+def : Pat<(v4i16 (bitconvert (i64 GR64:$src))),
+ (MMX_MOVD64to64rr GR64:$src)>;
+def : Pat<(v8i8 (bitconvert (i64 GR64:$src))),
+ (MMX_MOVD64to64rr GR64:$src)>;
+def : Pat<(i64 (bitconvert (v1i64 VR64:$src))),
+ (MMX_MOVD64from64rr VR64:$src)>;
+def : Pat<(i64 (bitconvert (v2i32 VR64:$src))),
+ (MMX_MOVD64from64rr VR64:$src)>;
+def : Pat<(i64 (bitconvert (v2f32 VR64:$src))),
+ (MMX_MOVD64from64rr VR64:$src)>;
+def : Pat<(i64 (bitconvert (v4i16 VR64:$src))),
+ (MMX_MOVD64from64rr VR64:$src)>;
+def : Pat<(i64 (bitconvert (v8i8 VR64:$src))),
+ (MMX_MOVD64from64rr VR64:$src)>;
+def : Pat<(f64 (bitconvert (v1i64 VR64:$src))),
+ (MMX_MOVQ2FR64rr VR64:$src)>;
+def : Pat<(f64 (bitconvert (v2i32 VR64:$src))),
+ (MMX_MOVQ2FR64rr VR64:$src)>;
+def : Pat<(f64 (bitconvert (v4i16 VR64:$src))),
+ (MMX_MOVQ2FR64rr VR64:$src)>;
+def : Pat<(f64 (bitconvert (v8i8 VR64:$src))),
+ (MMX_MOVQ2FR64rr VR64:$src)>;
+def : Pat<(v1i64 (bitconvert (f64 FR64:$src))),
+ (MMX_MOVFR642Qrr FR64:$src)>;
+def : Pat<(v2i32 (bitconvert (f64 FR64:$src))),
+ (MMX_MOVFR642Qrr FR64:$src)>;
+def : Pat<(v4i16 (bitconvert (f64 FR64:$src))),
+ (MMX_MOVFR642Qrr FR64:$src)>;
+def : Pat<(v8i8 (bitconvert (f64 FR64:$src))),
+ (MMX_MOVFR642Qrr FR64:$src)>;
+
+let AddedComplexity = 20 in {
+ def : Pat<(v2i32 (X86vzmovl (bc_v2i32 (load_mmx addr:$src)))),
+ (MMX_MOVZDI2PDIrm addr:$src)>;
+}
+
+// Clear top half.
+let AddedComplexity = 15 in {
+ def : Pat<(v2i32 (X86vzmovl VR64:$src)),
+ (MMX_PUNPCKLDQrr VR64:$src, (v2i32 (MMX_V_SET0)))>;
+}
+
+// Patterns to perform canonical versions of vector shuffling.
+let AddedComplexity = 10 in {
+ def : Pat<(v8i8 (mmx_unpckl_undef VR64:$src, (undef))),
+ (MMX_PUNPCKLBWrr VR64:$src, VR64:$src)>;
+ def : Pat<(v4i16 (mmx_unpckl_undef VR64:$src, (undef))),
+ (MMX_PUNPCKLWDrr VR64:$src, VR64:$src)>;
+ def : Pat<(v2i32 (mmx_unpckl_undef VR64:$src, (undef))),
+ (MMX_PUNPCKLDQrr VR64:$src, VR64:$src)>;
+}
+
+let AddedComplexity = 10 in {
+ def : Pat<(v8i8 (mmx_unpckh_undef VR64:$src, (undef))),
+ (MMX_PUNPCKHBWrr VR64:$src, VR64:$src)>;
+ def : Pat<(v4i16 (mmx_unpckh_undef VR64:$src, (undef))),
+ (MMX_PUNPCKHWDrr VR64:$src, VR64:$src)>;
+ def : Pat<(v2i32 (mmx_unpckh_undef VR64:$src, (undef))),
+ (MMX_PUNPCKHDQrr VR64:$src, VR64:$src)>;
+}
+
+// Some special case PANDN patterns.
+// FIXME: Get rid of these.
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v2i32 immAllOnesV))),
+ VR64:$src2)),
+ (MMX_PANDNrr VR64:$src1, VR64:$src2)>;
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v2i32 immAllOnesV))),
+ (load addr:$src2))),
+ (MMX_PANDNrm VR64:$src1, addr:$src2)>;
+
+// Move MMX to lower 64-bit of XMM
+def : Pat<(v2i64 (scalar_to_vector (i64 (bitconvert (v8i8 VR64:$src))))),
+ (v2i64 (MMX_MOVQ2DQrr VR64:$src))>;
+def : Pat<(v2i64 (scalar_to_vector (i64 (bitconvert (v4i16 VR64:$src))))),
+ (v2i64 (MMX_MOVQ2DQrr VR64:$src))>;
+def : Pat<(v2i64 (scalar_to_vector (i64 (bitconvert (v2i32 VR64:$src))))),
+ (v2i64 (MMX_MOVQ2DQrr VR64:$src))>;
+def : Pat<(v2i64 (scalar_to_vector (i64 (bitconvert (v1i64 VR64:$src))))),
+ (v2i64 (MMX_MOVQ2DQrr VR64:$src))>;
+
+// Move lower 64-bit of XMM to MMX.
+def : Pat<(v2i32 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
+ (iPTR 0))))),
+ (v2i32 (MMX_MOVDQ2Qrr VR128:$src))>;
+def : Pat<(v4i16 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
+ (iPTR 0))))),
+ (v4i16 (MMX_MOVDQ2Qrr VR128:$src))>;
+def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
+ (iPTR 0))))),
+ (v8i8 (MMX_MOVDQ2Qrr VR128:$src))>;
+
+// Patterns for vector comparisons
+def : Pat<(v8i8 (X86pcmpeqb VR64:$src1, VR64:$src2)),
+ (MMX_PCMPEQBrr VR64:$src1, VR64:$src2)>;
+def : Pat<(v8i8 (X86pcmpeqb VR64:$src1, (bitconvert (load_mmx addr:$src2)))),
+ (MMX_PCMPEQBrm VR64:$src1, addr:$src2)>;
+def : Pat<(v4i16 (X86pcmpeqw VR64:$src1, VR64:$src2)),
+ (MMX_PCMPEQWrr VR64:$src1, VR64:$src2)>;
+def : Pat<(v4i16 (X86pcmpeqw VR64:$src1, (bitconvert (load_mmx addr:$src2)))),
+ (MMX_PCMPEQWrm VR64:$src1, addr:$src2)>;
+def : Pat<(v2i32 (X86pcmpeqd VR64:$src1, VR64:$src2)),
+ (MMX_PCMPEQDrr VR64:$src1, VR64:$src2)>;
+def : Pat<(v2i32 (X86pcmpeqd VR64:$src1, (bitconvert (load_mmx addr:$src2)))),
+ (MMX_PCMPEQDrm VR64:$src1, addr:$src2)>;
+
+def : Pat<(v8i8 (X86pcmpgtb VR64:$src1, VR64:$src2)),
+ (MMX_PCMPGTBrr VR64:$src1, VR64:$src2)>;
+def : Pat<(v8i8 (X86pcmpgtb VR64:$src1, (bitconvert (load_mmx addr:$src2)))),
+ (MMX_PCMPGTBrm VR64:$src1, addr:$src2)>;
+def : Pat<(v4i16 (X86pcmpgtw VR64:$src1, VR64:$src2)),
+ (MMX_PCMPGTWrr VR64:$src1, VR64:$src2)>;
+def : Pat<(v4i16 (X86pcmpgtw VR64:$src1, (bitconvert (load_mmx addr:$src2)))),
+ (MMX_PCMPGTWrm VR64:$src1, addr:$src2)>;
+def : Pat<(v2i32 (X86pcmpgtd VR64:$src1, VR64:$src2)),
+ (MMX_PCMPGTDrr VR64:$src1, VR64:$src2)>;
+def : Pat<(v2i32 (X86pcmpgtd VR64:$src1, (bitconvert (load_mmx addr:$src2)))),
+ (MMX_PCMPGTDrm VR64:$src1, addr:$src2)>;
+
+// CMOV* - Used to implement the SELECT DAG operation. Expanded after
+// instruction selection into a branch sequence.
+let Uses = [EFLAGS], usesCustomInserter = 1 in {
+ def CMOV_V1I64 : I<0, Pseudo,
+ (outs VR64:$dst), (ins VR64:$t, VR64:$f, i8imm:$cond),
+ "#CMOV_V1I64 PSEUDO!",
+ [(set VR64:$dst,
+ (v1i64 (X86cmov VR64:$t, VR64:$f, imm:$cond,
+ EFLAGS)))]>;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86InstrSSE.td b/contrib/llvm/lib/Target/X86/X86InstrSSE.td
new file mode 100644
index 0000000..5580ba7
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrSSE.td
@@ -0,0 +1,4142 @@
+//====- X86InstrSSE.td - Describe the X86 Instruction Set --*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 SSE instruction set, defining the instructions
+// and the properties of those instructions that are needed for code
+// generation, machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// SSE specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+def SDTX86FPShiftOp : SDTypeProfile<1, 2, [ SDTCisSameAs<0, 1>,
+ SDTCisFP<0>, SDTCisInt<2> ]>;
+def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>,
+ SDTCisFP<1>, SDTCisVT<3, i8>]>;
+
+def X86fmin : SDNode<"X86ISD::FMIN", SDTFPBinOp>;
+def X86fmax : SDNode<"X86ISD::FMAX", SDTFPBinOp>;
+def X86fand : SDNode<"X86ISD::FAND", SDTFPBinOp,
+ [SDNPCommutative, SDNPAssociative]>;
+def X86for : SDNode<"X86ISD::FOR", SDTFPBinOp,
+ [SDNPCommutative, SDNPAssociative]>;
+def X86fxor : SDNode<"X86ISD::FXOR", SDTFPBinOp,
+ [SDNPCommutative, SDNPAssociative]>;
+def X86frsqrt : SDNode<"X86ISD::FRSQRT", SDTFPUnaryOp>;
+def X86frcp : SDNode<"X86ISD::FRCP", SDTFPUnaryOp>;
+def X86fsrl : SDNode<"X86ISD::FSRL", SDTX86FPShiftOp>;
+def X86comi : SDNode<"X86ISD::COMI", SDTX86CmpTest>;
+def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>;
+def X86pshufb : SDNode<"X86ISD::PSHUFB",
+ SDTypeProfile<1, 2, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>]>>;
+def X86pextrb : SDNode<"X86ISD::PEXTRB",
+ SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>;
+def X86pextrw : SDNode<"X86ISD::PEXTRW",
+ SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>;
+def X86pinsrb : SDNode<"X86ISD::PINSRB",
+ SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>,
+ SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>;
+def X86pinsrw : SDNode<"X86ISD::PINSRW",
+ SDTypeProfile<1, 3, [SDTCisVT<0, v8i16>, SDTCisSameAs<0,1>,
+ SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>;
+def X86insrtps : SDNode<"X86ISD::INSERTPS",
+ SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisSameAs<0,1>,
+ SDTCisVT<2, v4f32>, SDTCisPtrTy<3>]>>;
+def X86vzmovl : SDNode<"X86ISD::VZEXT_MOVL",
+ SDTypeProfile<1, 1, [SDTCisSameAs<0,1>]>>;
+def X86vzload : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad,
+ [SDNPHasChain, SDNPMayLoad]>;
+def X86vshl : SDNode<"X86ISD::VSHL", SDTIntShiftOp>;
+def X86vshr : SDNode<"X86ISD::VSRL", SDTIntShiftOp>;
+def X86cmpps : SDNode<"X86ISD::CMPPS", SDTX86VFCMP>;
+def X86cmppd : SDNode<"X86ISD::CMPPD", SDTX86VFCMP>;
+def X86pcmpeqb : SDNode<"X86ISD::PCMPEQB", SDTIntBinOp, [SDNPCommutative]>;
+def X86pcmpeqw : SDNode<"X86ISD::PCMPEQW", SDTIntBinOp, [SDNPCommutative]>;
+def X86pcmpeqd : SDNode<"X86ISD::PCMPEQD", SDTIntBinOp, [SDNPCommutative]>;
+def X86pcmpeqq : SDNode<"X86ISD::PCMPEQQ", SDTIntBinOp, [SDNPCommutative]>;
+def X86pcmpgtb : SDNode<"X86ISD::PCMPGTB", SDTIntBinOp>;
+def X86pcmpgtw : SDNode<"X86ISD::PCMPGTW", SDTIntBinOp>;
+def X86pcmpgtd : SDNode<"X86ISD::PCMPGTD", SDTIntBinOp>;
+def X86pcmpgtq : SDNode<"X86ISD::PCMPGTQ", SDTIntBinOp>;
+
+def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
+ SDTCisVT<1, v4f32>,
+ SDTCisVT<2, v4f32>]>;
+def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>;
+
+//===----------------------------------------------------------------------===//
+// SSE Complex Patterns
+//===----------------------------------------------------------------------===//
+
+// These are 'extloads' from a scalar to the low element of a vector, zeroing
+// the top elements. These are used for the SSE 'ss' and 'sd' instruction
+// forms.
+def sse_load_f32 : ComplexPattern<v4f32, 5, "SelectScalarSSELoad", [],
+ [SDNPHasChain, SDNPMayLoad]>;
+def sse_load_f64 : ComplexPattern<v2f64, 5, "SelectScalarSSELoad", [],
+ [SDNPHasChain, SDNPMayLoad]>;
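+// For example, the SSrm_Int definitions below use sse_load_f32 so that a
+// pattern like (int_x86_sse_add_ss VR128:$src1, sse_load_f32:$src2) can fold
+// a scalar f32 load straight into the memory form of an 'ss' instruction;
+// SelectScalarSSELoad is assumed to accept only loads that are safe to widen
+// in this way.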
+
+def ssmem : Operand<v4f32> {
+ let PrintMethod = "printf32mem";
+ let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm);
+ let ParserMatchClass = X86MemAsmOperand;
+}
+def sdmem : Operand<v2f64> {
+ let PrintMethod = "printf64mem";
+ let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm);
+ let ParserMatchClass = X86MemAsmOperand;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE pattern fragments
+//===----------------------------------------------------------------------===//
+
+def loadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (load node:$ptr))>;
+def loadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (load node:$ptr))>;
+def loadv4i32 : PatFrag<(ops node:$ptr), (v4i32 (load node:$ptr))>;
+def loadv2i64 : PatFrag<(ops node:$ptr), (v2i64 (load node:$ptr))>;
+
+// Like 'store', but always requires vector alignment.
+def alignedstore : PatFrag<(ops node:$val, node:$ptr),
+ (store node:$val, node:$ptr), [{
+ return cast<StoreSDNode>(N)->getAlignment() >= 16;
+}]>;
+
+// Like 'load', but always requires vector alignment.
+def alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return cast<LoadSDNode>(N)->getAlignment() >= 16;
+}]>;
+
+def alignedloadfsf32 : PatFrag<(ops node:$ptr),
+ (f32 (alignedload node:$ptr))>;
+def alignedloadfsf64 : PatFrag<(ops node:$ptr),
+ (f64 (alignedload node:$ptr))>;
+def alignedloadv4f32 : PatFrag<(ops node:$ptr),
+ (v4f32 (alignedload node:$ptr))>;
+def alignedloadv2f64 : PatFrag<(ops node:$ptr),
+ (v2f64 (alignedload node:$ptr))>;
+def alignedloadv4i32 : PatFrag<(ops node:$ptr),
+ (v4i32 (alignedload node:$ptr))>;
+def alignedloadv2i64 : PatFrag<(ops node:$ptr),
+ (v2i64 (alignedload node:$ptr))>;
+
+// Like 'load', but uses special alignment checks suitable for use in
+// memory operands in most SSE instructions, which are required to
+// be naturally aligned on some targets but not on others. If the subtarget
+// allows unaligned accesses, match any load, though this may require
+// setting a feature bit in the processor (on startup, for example).
+// Opteron 10h and later implement such a feature.
+def memop : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return Subtarget->hasVectorUAMem()
+ || cast<LoadSDNode>(N)->getAlignment() >= 16;
+}]>;
+
+def memopfsf32 : PatFrag<(ops node:$ptr), (f32 (memop node:$ptr))>;
+def memopfsf64 : PatFrag<(ops node:$ptr), (f64 (memop node:$ptr))>;
+def memopv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop node:$ptr))>;
+def memopv2f64 : PatFrag<(ops node:$ptr), (v2f64 (memop node:$ptr))>;
+def memopv4i32 : PatFrag<(ops node:$ptr), (v4i32 (memop node:$ptr))>;
+def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>;
+def memopv16i8 : PatFrag<(ops node:$ptr), (v16i8 (memop node:$ptr))>;
+
+// SSSE3 uses MMX registers for some instructions. They aren't aligned on a
+// 16-byte boundary.
+// FIXME: 8 byte alignment for mmx reads is not required
+def memop64 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return cast<LoadSDNode>(N)->getAlignment() >= 8;
+}]>;
+
+def memopv8i8 : PatFrag<(ops node:$ptr), (v8i8 (memop64 node:$ptr))>;
+def memopv4i16 : PatFrag<(ops node:$ptr), (v4i16 (memop64 node:$ptr))>;
+def memopv8i16 : PatFrag<(ops node:$ptr), (v8i16 (memop64 node:$ptr))>;
+def memopv2i32 : PatFrag<(ops node:$ptr), (v2i32 (memop64 node:$ptr))>;
+
+// MOVNT Support
+// Like 'store', but requires the non-temporal bit to be set
+def nontemporalstore : PatFrag<(ops node:$val, node:$ptr),
+ (st node:$val, node:$ptr), [{
+ if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N))
+ return ST->isNonTemporal();
+ return false;
+}]>;
+
+def alignednontemporalstore : PatFrag<(ops node:$val, node:$ptr),
+ (st node:$val, node:$ptr), [{
+ if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N))
+ return ST->isNonTemporal() && !ST->isTruncatingStore() &&
+ ST->getAddressingMode() == ISD::UNINDEXED &&
+ ST->getAlignment() >= 16;
+ return false;
+}]>;
+
+def unalignednontemporalstore : PatFrag<(ops node:$val, node:$ptr),
+ (st node:$val, node:$ptr), [{
+ if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N))
+ return ST->isNonTemporal() &&
+ ST->getAlignment() < 16;
+ return false;
+}]>;
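+// A rough sketch of how these non-temporal predicates are used by the stores
+// defined later in this file: alignednontemporalstore feeds the 16-byte
+// aligned packed forms (e.g. MOVNTPSmr, MOVNTDQ_64mr), while the plain
+// nontemporalstore predicate feeds the scalar MOVNTImr / MOVNTI_64mr forms.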
+
+def bc_v4f32 : PatFrag<(ops node:$in), (v4f32 (bitconvert node:$in))>;
+def bc_v2f64 : PatFrag<(ops node:$in), (v2f64 (bitconvert node:$in))>;
+def bc_v16i8 : PatFrag<(ops node:$in), (v16i8 (bitconvert node:$in))>;
+def bc_v8i16 : PatFrag<(ops node:$in), (v8i16 (bitconvert node:$in))>;
+def bc_v4i32 : PatFrag<(ops node:$in), (v4i32 (bitconvert node:$in))>;
+def bc_v2i64 : PatFrag<(ops node:$in), (v2i64 (bitconvert node:$in))>;
+
+def vzmovl_v2i64 : PatFrag<(ops node:$src),
+ (bitconvert (v2i64 (X86vzmovl
+ (v2i64 (scalar_to_vector (loadi64 node:$src))))))>;
+def vzmovl_v4i32 : PatFrag<(ops node:$src),
+ (bitconvert (v4i32 (X86vzmovl
+ (v4i32 (scalar_to_vector (loadi32 node:$src))))))>;
+
+def vzload_v2i64 : PatFrag<(ops node:$src),
+ (bitconvert (v2i64 (X86vzload node:$src)))>;
+
+
+def fp32imm0 : PatLeaf<(f32 fpimm), [{
+ return N->isExactlyValue(+0.0);
+}]>;
+
+// BYTE_imm - Transform bit immediates into byte immediates.
+def BYTE_imm : SDNodeXForm<imm, [{
+ // Transformation function: imm >> 3
+ return getI32Imm(N->getZExtValue() >> 3);
+}]>;
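+// For example, a shift amount of 16 bits becomes a byte count of 2
+// (16 >> 3); this is intended for instructions whose immediate operand is a
+// byte count rather than a bit count (such as the double-quadword shifts).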
+
+// SHUFFLE_get_shuf_imm xform function: convert vector_shuffle mask to PSHUF*,
+// SHUFP* etc. imm.
+def SHUFFLE_get_shuf_imm : SDNodeXForm<vector_shuffle, [{
+ return getI8Imm(X86::getShuffleSHUFImmediate(N));
+}]>;
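+// As an illustration, assuming the usual two-bits-per-element PSHUFD
+// encoding (low field selects element 0 of the result), the identity mask
+// <0,1,2,3> encodes as 0xE4 and the reversal mask <3,2,1,0> as 0x1B.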
+
+// SHUFFLE_get_pshufhw_imm xform function: convert vector_shuffle mask to
+// PSHUFHW imm.
+def SHUFFLE_get_pshufhw_imm : SDNodeXForm<vector_shuffle, [{
+ return getI8Imm(X86::getShufflePSHUFHWImmediate(N));
+}]>;
+
+// SHUFFLE_get_pshuflw_imm xform function: convert vector_shuffle mask to
+// PSHUFLW imm.
+def SHUFFLE_get_pshuflw_imm : SDNodeXForm<vector_shuffle, [{
+ return getI8Imm(X86::getShufflePSHUFLWImmediate(N));
+}]>;
+
+// SHUFFLE_get_palign_imm xform function: convert vector_shuffle mask to
+// a PALIGNR imm.
+def SHUFFLE_get_palign_imm : SDNodeXForm<vector_shuffle, [{
+ return getI8Imm(X86::getShufflePALIGNRImmediate(N));
+}]>;
+
+def splat_lo : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ return SVOp->isSplat() && SVOp->getSplatIndex() == 0;
+}]>;
+
+def movddup : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isMOVDDUPMask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def movhlps : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isMOVHLPSMask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def movhlps_undef : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isMOVHLPS_v_undef_Mask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def movlhps : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isMOVLHPSMask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def movlp : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isMOVLPMask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def movl : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isMOVLMask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def movshdup : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isMOVSHDUPMask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def movsldup : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isMOVSLDUPMask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def unpckl : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isUNPCKLMask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def unpckh : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isUNPCKHMask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def unpckl_undef : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isUNPCKL_v_undef_Mask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def unpckh_undef : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isUNPCKH_v_undef_Mask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def pshufd : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isPSHUFDMask(cast<ShuffleVectorSDNode>(N));
+}], SHUFFLE_get_shuf_imm>;
+
+def shufp : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isSHUFPMask(cast<ShuffleVectorSDNode>(N));
+}], SHUFFLE_get_shuf_imm>;
+
+def pshufhw : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isPSHUFHWMask(cast<ShuffleVectorSDNode>(N));
+}], SHUFFLE_get_pshufhw_imm>;
+
+def pshuflw : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isPSHUFLWMask(cast<ShuffleVectorSDNode>(N));
+}], SHUFFLE_get_pshuflw_imm>;
+
+def palign : PatFrag<(ops node:$lhs, node:$rhs),
+ (vector_shuffle node:$lhs, node:$rhs), [{
+ return X86::isPALIGNRMask(cast<ShuffleVectorSDNode>(N));
+}], SHUFFLE_get_palign_imm>;
+
+//===----------------------------------------------------------------------===//
+// SSE scalar FP Instructions
+//===----------------------------------------------------------------------===//
+
+// CMOV* - Used to implement the SSE SELECT DAG operation. Expanded after
+// instruction selection into a branch sequence.
+let Uses = [EFLAGS], usesCustomInserter = 1 in {
+ def CMOV_FR32 : I<0, Pseudo,
+ (outs FR32:$dst), (ins FR32:$t, FR32:$f, i8imm:$cond),
+ "#CMOV_FR32 PSEUDO!",
+ [(set FR32:$dst, (X86cmov FR32:$t, FR32:$f, imm:$cond,
+ EFLAGS))]>;
+ def CMOV_FR64 : I<0, Pseudo,
+ (outs FR64:$dst), (ins FR64:$t, FR64:$f, i8imm:$cond),
+ "#CMOV_FR64 PSEUDO!",
+ [(set FR64:$dst, (X86cmov FR64:$t, FR64:$f, imm:$cond,
+ EFLAGS))]>;
+ def CMOV_V4F32 : I<0, Pseudo,
+ (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
+ "#CMOV_V4F32 PSEUDO!",
+ [(set VR128:$dst,
+ (v4f32 (X86cmov VR128:$t, VR128:$f, imm:$cond,
+ EFLAGS)))]>;
+ def CMOV_V2F64 : I<0, Pseudo,
+ (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
+ "#CMOV_V2F64 PSEUDO!",
+ [(set VR128:$dst,
+ (v2f64 (X86cmov VR128:$t, VR128:$f, imm:$cond,
+ EFLAGS)))]>;
+ def CMOV_V2I64 : I<0, Pseudo,
+ (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
+ "#CMOV_V2I64 PSEUDO!",
+ [(set VR128:$dst,
+ (v2i64 (X86cmov VR128:$t, VR128:$f, imm:$cond,
+ EFLAGS)))]>;
+}
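+// Because these pseudos set usesCustomInserter, they are expected to be
+// expanded by the target's EmitInstrWithCustomInserter hook into a
+// conditional branch on EFLAGS plus a PHI that merges the $t and $f values.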
+
+//===----------------------------------------------------------------------===//
+// SSE1 Instructions
+//===----------------------------------------------------------------------===//
+
+// Move Instructions. Register-to-register movss is not used for FR32
+// register copies because it's a partial register update; FsMOVAPSrr is
+// used instead. Register-to-register movss is not modeled as an INSERT_SUBREG
+// because INSERT_SUBREG requires that the insert be implementable in terms of
+// a copy, and, as just mentioned, we don't use movss for copies.
+let Constraints = "$src1 = $dst" in
+def MOVSSrr : SSI<0x10, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, FR32:$src2),
+ "movss\t{$src2, $dst|$dst, $src2}",
+ [(set (v4f32 VR128:$dst),
+ (movl VR128:$src1, (scalar_to_vector FR32:$src2)))]>;
+
+// Extract the low 32-bit value from one vector and insert it into another.
+let AddedComplexity = 15 in
+def : Pat<(v4f32 (movl VR128:$src1, VR128:$src2)),
+ (MOVSSrr (v4f32 VR128:$src1),
+ (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>;
+
+// Implicitly promote a 32-bit scalar to a vector.
+def : Pat<(v4f32 (scalar_to_vector FR32:$src)),
+ (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, sub_ss)>;
+
+// Loading from memory automatically zeros the upper bits.
+let canFoldAsLoad = 1, isReMaterializable = 1 in
+def MOVSSrm : SSI<0x10, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src),
+ "movss\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (loadf32 addr:$src))]>;
+
+// MOVSSrm zeros the high parts of the register; represent this
+// with SUBREG_TO_REG.
+let AddedComplexity = 20 in {
+def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
+ (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>;
+def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
+ (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>;
+def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
+ (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>;
+}
+
+// Store scalar value to memory.
+def MOVSSmr : SSI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, FR32:$src),
+ "movss\t{$src, $dst|$dst, $src}",
+ [(store FR32:$src, addr:$dst)]>;
+
+// Extract and store.
+def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
+ addr:$dst),
+ (MOVSSmr addr:$dst,
+ (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
+
+// Conversion instructions
+def CVTTSS2SIrr : SSI<0x2C, MRMSrcReg, (outs GR32:$dst), (ins FR32:$src),
+ "cvttss2si\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (fp_to_sint FR32:$src))]>;
+def CVTTSS2SIrm : SSI<0x2C, MRMSrcMem, (outs GR32:$dst), (ins f32mem:$src),
+ "cvttss2si\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (fp_to_sint (loadf32 addr:$src)))]>;
+def CVTSI2SSrr : SSI<0x2A, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
+ "cvtsi2ss\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (sint_to_fp GR32:$src))]>;
+def CVTSI2SSrm : SSI<0x2A, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
+ "cvtsi2ss\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (sint_to_fp (loadi32 addr:$src)))]>;
+
+// Match intrinsics which expect XMM operand(s).
+def CVTSS2SIrr: SSI<0x2D, MRMSrcReg, (outs GR32:$dst), (ins FR32:$src),
+ "cvtss2si{l}\t{$src, $dst|$dst, $src}", []>;
+def CVTSS2SIrm: SSI<0x2D, MRMSrcMem, (outs GR32:$dst), (ins f32mem:$src),
+ "cvtss2si{l}\t{$src, $dst|$dst, $src}", []>;
+
+def Int_CVTSS2SIrr : SSI<0x2D, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
+ "cvtss2si\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (int_x86_sse_cvtss2si VR128:$src))]>;
+def Int_CVTSS2SIrm : SSI<0x2D, MRMSrcMem, (outs GR32:$dst), (ins f32mem:$src),
+ "cvtss2si\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (int_x86_sse_cvtss2si
+ (load addr:$src)))]>;
+
+// Match intrinsics which expect MM and XMM operand(s).
+def Int_CVTPS2PIrr : PSI<0x2D, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
+ "cvtps2pi\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst, (int_x86_sse_cvtps2pi VR128:$src))]>;
+def Int_CVTPS2PIrm : PSI<0x2D, MRMSrcMem, (outs VR64:$dst), (ins f64mem:$src),
+ "cvtps2pi\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst, (int_x86_sse_cvtps2pi
+ (load addr:$src)))]>;
+def Int_CVTTPS2PIrr: PSI<0x2C, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
+ "cvttps2pi\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst, (int_x86_sse_cvttps2pi VR128:$src))]>;
+def Int_CVTTPS2PIrm: PSI<0x2C, MRMSrcMem, (outs VR64:$dst), (ins f64mem:$src),
+ "cvttps2pi\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst, (int_x86_sse_cvttps2pi
+ (load addr:$src)))]>;
+let Constraints = "$src1 = $dst" in {
+ def Int_CVTPI2PSrr : PSI<0x2A, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR64:$src2),
+ "cvtpi2ps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse_cvtpi2ps VR128:$src1,
+ VR64:$src2))]>;
+ def Int_CVTPI2PSrm : PSI<0x2A, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i64mem:$src2),
+ "cvtpi2ps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse_cvtpi2ps VR128:$src1,
+ (load addr:$src2)))]>;
+}
+
+// Aliases for intrinsics
+def Int_CVTTSS2SIrr : SSI<0x2C, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
+ "cvttss2si\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst,
+ (int_x86_sse_cvttss2si VR128:$src))]>;
+def Int_CVTTSS2SIrm : SSI<0x2C, MRMSrcMem, (outs GR32:$dst), (ins f32mem:$src),
+ "cvttss2si\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst,
+ (int_x86_sse_cvttss2si(load addr:$src)))]>;
+
+let Constraints = "$src1 = $dst" in {
+ def Int_CVTSI2SSrr : SSI<0x2A, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, GR32:$src2),
+ "cvtsi2ss\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse_cvtsi2ss VR128:$src1,
+ GR32:$src2))]>;
+ def Int_CVTSI2SSrm : SSI<0x2A, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i32mem:$src2),
+ "cvtsi2ss\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse_cvtsi2ss VR128:$src1,
+ (loadi32 addr:$src2)))]>;
+}
+
+// Comparison instructions
+let Constraints = "$src1 = $dst", neverHasSideEffects = 1 in {
+ def CMPSSrr : SSIi8<0xC2, MRMSrcReg,
+ (outs FR32:$dst), (ins FR32:$src1, FR32:$src, SSECC:$cc),
+ "cmp${cc}ss\t{$src, $dst|$dst, $src}", []>;
+let mayLoad = 1 in
+ def CMPSSrm : SSIi8<0xC2, MRMSrcMem,
+ (outs FR32:$dst), (ins FR32:$src1, f32mem:$src, SSECC:$cc),
+ "cmp${cc}ss\t{$src, $dst|$dst, $src}", []>;
+
+ // Accept explicit immediate argument form instead of comparison code.
+let isAsmParserOnly = 1 in {
+ def CMPSSrr_alt : SSIi8<0xC2, MRMSrcReg,
+ (outs FR32:$dst), (ins FR32:$src1, FR32:$src, i8imm:$src2),
+ "cmpss\t{$src2, $src, $dst|$dst, $src, $src2}", []>;
+let mayLoad = 1 in
+ def CMPSSrm_alt : SSIi8<0xC2, MRMSrcMem,
+ (outs FR32:$dst), (ins FR32:$src1, f32mem:$src, i8imm:$src2),
+ "cmpss\t{$src2, $src, $dst|$dst, $src, $src2}", []>;
+}
+}
+
+let Defs = [EFLAGS] in {
+def UCOMISSrr: PSI<0x2E, MRMSrcReg, (outs), (ins FR32:$src1, FR32:$src2),
+ "ucomiss\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86cmp FR32:$src1, FR32:$src2))]>;
+def UCOMISSrm: PSI<0x2E, MRMSrcMem, (outs), (ins FR32:$src1, f32mem:$src2),
+ "ucomiss\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86cmp FR32:$src1, (loadf32 addr:$src2)))]>;
+
+def COMISSrr: PSI<0x2F, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
+ "comiss\t{$src2, $src1|$src1, $src2}", []>;
+def COMISSrm: PSI<0x2F, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
+ "comiss\t{$src2, $src1|$src1, $src2}", []>;
+
+} // Defs = [EFLAGS]
+
+// Aliases to match intrinsics which expect XMM operand(s).
+let Constraints = "$src1 = $dst" in {
+ def Int_CMPSSrr : SSIi8<0xC2, MRMSrcReg,
+ (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src, SSECC:$cc),
+ "cmp${cc}ss\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse_cmp_ss
+ VR128:$src1,
+ VR128:$src, imm:$cc))]>;
+ def Int_CMPSSrm : SSIi8<0xC2, MRMSrcMem,
+ (outs VR128:$dst),
+ (ins VR128:$src1, f32mem:$src, SSECC:$cc),
+ "cmp${cc}ss\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse_cmp_ss VR128:$src1,
+ (load addr:$src), imm:$cc))]>;
+}
+
+let Defs = [EFLAGS] in {
+def Int_UCOMISSrr: PSI<0x2E, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
+ "ucomiss\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86ucomi (v4f32 VR128:$src1),
+ VR128:$src2))]>;
+def Int_UCOMISSrm: PSI<0x2E, MRMSrcMem, (outs),(ins VR128:$src1, f128mem:$src2),
+ "ucomiss\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86ucomi (v4f32 VR128:$src1),
+ (load addr:$src2)))]>;
+
+def Int_COMISSrr: PSI<0x2F, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
+ "comiss\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86comi (v4f32 VR128:$src1),
+ VR128:$src2))]>;
+def Int_COMISSrm: PSI<0x2F, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
+ "comiss\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86comi (v4f32 VR128:$src1),
+ (load addr:$src2)))]>;
+} // Defs = [EFLAGS]
+
+// Aliases of packed SSE1 instructions for scalar use. These all have names
+// that start with 'Fs'.
+
+// Alias instructions that map fld0 to pxor for sse.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isCodeGenOnly = 1,
+ canFoldAsLoad = 1 in
+ // FIXME: Set encoding to pseudo!
+def FsFLD0SS : I<0xEF, MRMInitReg, (outs FR32:$dst), (ins), "",
+ [(set FR32:$dst, fp32imm0)]>,
+ Requires<[HasSSE1]>, TB, OpSize;
+
+// Alias instruction to do FR32 reg-to-reg copy using movaps. Upper bits are
+// disregarded.
+let neverHasSideEffects = 1 in
+def FsMOVAPSrr : PSI<0x28, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
+ "movaps\t{$src, $dst|$dst, $src}", []>;
+
+// Alias instruction to load FR32 from f128mem using movaps. Upper bits are
+// disregarded.
+let canFoldAsLoad = 1, isReMaterializable = 1 in
+def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
+ "movaps\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (alignedloadfsf32 addr:$src))]>;
+
+// Alias bitwise logical operations using SSE logical ops on packed FP values.
+let Constraints = "$src1 = $dst" in {
+let isCommutable = 1 in {
+ def FsANDPSrr : PSI<0x54, MRMSrcReg, (outs FR32:$dst),
+ (ins FR32:$src1, FR32:$src2),
+ "andps\t{$src2, $dst|$dst, $src2}",
+ [(set FR32:$dst, (X86fand FR32:$src1, FR32:$src2))]>;
+ def FsORPSrr : PSI<0x56, MRMSrcReg, (outs FR32:$dst),
+ (ins FR32:$src1, FR32:$src2),
+ "orps\t{$src2, $dst|$dst, $src2}",
+ [(set FR32:$dst, (X86for FR32:$src1, FR32:$src2))]>;
+ def FsXORPSrr : PSI<0x57, MRMSrcReg, (outs FR32:$dst),
+ (ins FR32:$src1, FR32:$src2),
+ "xorps\t{$src2, $dst|$dst, $src2}",
+ [(set FR32:$dst, (X86fxor FR32:$src1, FR32:$src2))]>;
+}
+
+def FsANDPSrm : PSI<0x54, MRMSrcMem, (outs FR32:$dst),
+ (ins FR32:$src1, f128mem:$src2),
+ "andps\t{$src2, $dst|$dst, $src2}",
+ [(set FR32:$dst, (X86fand FR32:$src1,
+ (memopfsf32 addr:$src2)))]>;
+def FsORPSrm : PSI<0x56, MRMSrcMem, (outs FR32:$dst),
+ (ins FR32:$src1, f128mem:$src2),
+ "orps\t{$src2, $dst|$dst, $src2}",
+ [(set FR32:$dst, (X86for FR32:$src1,
+ (memopfsf32 addr:$src2)))]>;
+def FsXORPSrm : PSI<0x57, MRMSrcMem, (outs FR32:$dst),
+ (ins FR32:$src1, f128mem:$src2),
+ "xorps\t{$src2, $dst|$dst, $src2}",
+ [(set FR32:$dst, (X86fxor FR32:$src1,
+ (memopfsf32 addr:$src2)))]>;
+
+let neverHasSideEffects = 1 in {
+def FsANDNPSrr : PSI<0x55, MRMSrcReg,
+ (outs FR32:$dst), (ins FR32:$src1, FR32:$src2),
+ "andnps\t{$src2, $dst|$dst, $src2}", []>;
+let mayLoad = 1 in
+def FsANDNPSrm : PSI<0x55, MRMSrcMem,
+ (outs FR32:$dst), (ins FR32:$src1, f128mem:$src2),
+ "andnps\t{$src2, $dst|$dst, $src2}", []>;
+}
+}
+
+/// basic_sse1_fp_binop_rm - SSE1 binops come in both scalar and vector forms.
+///
+/// In addition, we also have a special variant of the scalar form here to
+/// represent the associated intrinsic operation. This form is unlike the
+/// plain scalar form, in that it takes an entire vector (instead of a scalar)
+/// and leaves the top elements unmodified (therefore these cannot be commuted).
+///
+/// These three forms can each be reg+reg or reg+mem, so there are a total of
+/// six "instructions".
+///
+let Constraints = "$src1 = $dst" in {
+multiclass basic_sse1_fp_binop_rm<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, Intrinsic F32Int,
+ bit Commutable = 0> {
+ // Scalar operation, reg+reg.
+ def SSrr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src1, FR32:$src2),
+ !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
+ [(set FR32:$dst, (OpNode FR32:$src1, FR32:$src2))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Scalar operation, reg+mem.
+ def SSrm : SSI<opc, MRMSrcMem, (outs FR32:$dst),
+ (ins FR32:$src1, f32mem:$src2),
+ !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
+ [(set FR32:$dst, (OpNode FR32:$src1, (load addr:$src2)))]>;
+
+ // Vector operation, reg+reg.
+ def PSrr : PSI<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (v4f32 (OpNode VR128:$src1, VR128:$src2)))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Vector operation, reg+mem.
+ def PSrm : PSI<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, f128mem:$src2),
+ !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (OpNode VR128:$src1, (memopv4f32 addr:$src2)))]>;
+
+ // Intrinsic operation, reg+reg.
+ def SSrr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2))]>;
+
+ // Intrinsic operation, reg+mem.
+ def SSrm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, ssmem:$src2),
+ !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (F32Int VR128:$src1,
+ sse_load_f32:$src2))]>;
+}
+}
+
+// Arithmetic instructions
+defm ADD : basic_sse1_fp_binop_rm<0x58, "add", fadd, int_x86_sse_add_ss, 1>;
+defm MUL : basic_sse1_fp_binop_rm<0x59, "mul", fmul, int_x86_sse_mul_ss, 1>;
+defm SUB : basic_sse1_fp_binop_rm<0x5C, "sub", fsub, int_x86_sse_sub_ss>;
+defm DIV : basic_sse1_fp_binop_rm<0x5E, "div", fdiv, int_x86_sse_div_ss>;
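+// As a concrete example of the expansion (using the standard defm name
+// concatenation), the "defm ADD" line above yields six instructions:
+// ADDSSrr/ADDSSrm (scalar FR32), ADDPSrr/ADDPSrm (packed v4f32) and
+// ADDSSrr_Int/ADDSSrm_Int (intrinsic forms on full VR128 values).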
+
+/// sse1_fp_binop_rm - Other SSE1 binops
+///
+/// This multiclass is like basic_sse1_fp_binop_rm, with the addition of
+/// instructions for a full-vector intrinsic form. Operations that map
+/// onto C operators don't use this form since they just use the plain
+/// vector form instead of having a separate vector intrinsic form.
+///
+/// This provides a total of eight "instructions".
+///
+let Constraints = "$src1 = $dst" in {
+multiclass sse1_fp_binop_rm<bits<8> opc, string OpcodeStr,
+ SDNode OpNode,
+ Intrinsic F32Int,
+ Intrinsic V4F32Int,
+ bit Commutable = 0> {
+
+ // Scalar operation, reg+reg.
+ def SSrr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src1, FR32:$src2),
+ !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
+ [(set FR32:$dst, (OpNode FR32:$src1, FR32:$src2))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Scalar operation, reg+mem.
+ def SSrm : SSI<opc, MRMSrcMem, (outs FR32:$dst),
+ (ins FR32:$src1, f32mem:$src2),
+ !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
+ [(set FR32:$dst, (OpNode FR32:$src1, (load addr:$src2)))]>;
+
+ // Vector operation, reg+reg.
+ def PSrr : PSI<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (v4f32 (OpNode VR128:$src1, VR128:$src2)))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Vector operation, reg+mem.
+ def PSrm : PSI<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, f128mem:$src2),
+ !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (OpNode VR128:$src1, (memopv4f32 addr:$src2)))]>;
+
+ // Intrinsic operation, reg+reg.
+ def SSrr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Intrinsic operation, reg+mem.
+ def SSrm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, ssmem:$src2),
+ !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (F32Int VR128:$src1,
+ sse_load_f32:$src2))]>;
+
+ // Vector intrinsic operation, reg+reg.
+ def PSrr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (V4F32Int VR128:$src1, VR128:$src2))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Vector intrinsic operation, reg+mem.
+ def PSrm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, f128mem:$src2),
+ !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (V4F32Int VR128:$src1, (memopv4f32 addr:$src2)))]>;
+}
+}
+
+defm MAX : sse1_fp_binop_rm<0x5F, "max", X86fmax,
+ int_x86_sse_max_ss, int_x86_sse_max_ps>;
+defm MIN : sse1_fp_binop_rm<0x5D, "min", X86fmin,
+ int_x86_sse_min_ss, int_x86_sse_min_ps>;
+
+//===----------------------------------------------------------------------===//
+// SSE packed FP Instructions
+
+// Move Instructions
+let neverHasSideEffects = 1 in
+def MOVAPSrr : PSI<0x28, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movaps\t{$src, $dst|$dst, $src}", []>;
+let canFoldAsLoad = 1, isReMaterializable = 1 in
+def MOVAPSrm : PSI<0x28, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "movaps\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (alignedloadv4f32 addr:$src))]>;
+
+def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movaps\t{$src, $dst|$dst, $src}",
+ [(alignedstore (v4f32 VR128:$src), addr:$dst)]>;
+
+let neverHasSideEffects = 1 in
+def MOVUPSrr : PSI<0x10, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movups\t{$src, $dst|$dst, $src}", []>;
+let canFoldAsLoad = 1, isReMaterializable = 1 in
+def MOVUPSrm : PSI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "movups\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (loadv4f32 addr:$src))]>;
+def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movups\t{$src, $dst|$dst, $src}",
+ [(store (v4f32 VR128:$src), addr:$dst)]>;
+
+// Intrinsic forms of MOVUPS load and store
+let canFoldAsLoad = 1, isReMaterializable = 1 in
+def MOVUPSrm_Int : PSI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "movups\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse_loadu_ps addr:$src))]>;
+def MOVUPSmr_Int : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movups\t{$src, $dst|$dst, $src}",
+ [(int_x86_sse_storeu_ps addr:$dst, VR128:$src)]>;
+
+let Constraints = "$src1 = $dst" in {
+ let AddedComplexity = 20 in {
+ def MOVLPSrm : PSI<0x12, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
+ "movlps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (movlp VR128:$src1,
+ (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))))]>;
+ def MOVHPSrm : PSI<0x16, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
+ "movhps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (movlhps VR128:$src1,
+ (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))))]>;
+ } // AddedComplexity
+} // Constraints = "$src1 = $dst"
+
+
+def : Pat<(movlhps VR128:$src1, (bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
+ (MOVHPSrm (v4i32 VR128:$src1), addr:$src2)>;
+
+def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movlps\t{$src, $dst|$dst, $src}",
+ [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)),
+ (iPTR 0))), addr:$dst)]>;
+
+// v2f64 extract element 1 is always custom lowered to unpack high to low
+// and extract element 0 so the non-store version isn't too horrible.
+def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movhps\t{$src, $dst|$dst, $src}",
+ [(store (f64 (vector_extract
+ (unpckh (bc_v2f64 (v4f32 VR128:$src)),
+ (undef)), (iPTR 0))), addr:$dst)]>;
+
+let Constraints = "$src1 = $dst" in {
+let AddedComplexity = 20 in {
+def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ "movlhps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (movlhps VR128:$src1, VR128:$src2)))]>;
+
+def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ "movhlps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (movhlps VR128:$src1, VR128:$src2)))]>;
+} // AddedComplexity
+} // Constraints = "$src1 = $dst"
+
+let AddedComplexity = 20 in {
+def : Pat<(v4f32 (movddup VR128:$src, (undef))),
+ (MOVLHPSrr (v4f32 VR128:$src), (v4f32 VR128:$src))>;
+def : Pat<(v2i64 (movddup VR128:$src, (undef))),
+ (MOVLHPSrr (v2i64 VR128:$src), (v2i64 VR128:$src))>;
+}
+
+
+
+// Arithmetic
+
+/// sse1_fp_unop_rm - SSE1 unops come in both scalar and vector forms.
+///
+/// In addition, we also have a special variant of the scalar form here to
+/// represent the associated intrinsic operation. This form is unlike the
+/// plain scalar form, in that it takes an entire vector (instead of a
+/// scalar) and leaves the top elements undefined.
+///
+/// And, we have a special variant form for a full-vector intrinsic form.
+///
+/// These four forms can each have a reg or a mem operand, so there are a
+/// total of eight "instructions".
+///
+multiclass sse1_fp_unop_rm<bits<8> opc, string OpcodeStr,
+ SDNode OpNode,
+ Intrinsic F32Int,
+ Intrinsic V4F32Int,
+ bit Commutable = 0> {
+ // Scalar operation, reg.
+ def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
+ !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
+ [(set FR32:$dst, (OpNode FR32:$src))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Scalar operation, mem.
+ def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src),
+ !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
+ [(set FR32:$dst, (OpNode (load addr:$src)))]>, XS,
+ Requires<[HasSSE1, OptForSize]>;
+
+ // Vector operation, reg.
+ def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Vector operation, mem.
+ def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))]>;
+
+ // Intrinsic operation, reg.
+ def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (F32Int VR128:$src))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Intrinsic operation, mem.
+ def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), (ins ssmem:$src),
+ !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (F32Int sse_load_f32:$src))]>;
+
+ // Vector intrinsic operation, reg
+ def PSr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (V4F32Int VR128:$src))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Vector intrinsic operation, mem
+ def PSm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (V4F32Int (memopv4f32 addr:$src)))]>;
+}
+
+// Square root.
+defm SQRT : sse1_fp_unop_rm<0x51, "sqrt", fsqrt,
+ int_x86_sse_sqrt_ss, int_x86_sse_sqrt_ps>;
+
+// Reciprocal approximations. Note that these typically require refinement
+// in order to obtain suitable precision.
+defm RSQRT : sse1_fp_unop_rm<0x52, "rsqrt", X86frsqrt,
+ int_x86_sse_rsqrt_ss, int_x86_sse_rsqrt_ps>;
+defm RCP : sse1_fp_unop_rm<0x53, "rcp", X86frcp,
+ int_x86_sse_rcp_ss, int_x86_sse_rcp_ps>;
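+// Likewise, each defm above expands to the eight forms described in the
+// sse1_fp_unop_rm comment; for SQRT these are SQRTSSr, SQRTSSm, SQRTPSr,
+// SQRTPSm, SQRTSSr_Int, SQRTSSm_Int, SQRTPSr_Int and SQRTPSm_Int.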
+
+// Logical
+let Constraints = "$src1 = $dst" in {
+ let isCommutable = 1 in {
+ def ANDPSrr : PSI<0x54, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "andps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (v2i64
+ (and VR128:$src1, VR128:$src2)))]>;
+ def ORPSrr : PSI<0x56, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "orps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (v2i64
+ (or VR128:$src1, VR128:$src2)))]>;
+ def XORPSrr : PSI<0x57, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "xorps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (v2i64
+ (xor VR128:$src1, VR128:$src2)))]>;
+ }
+
+ def ANDPSrm : PSI<0x54, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+ "andps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (and (bc_v2i64 (v4f32 VR128:$src1)),
+ (memopv2i64 addr:$src2)))]>;
+ def ORPSrm : PSI<0x56, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+ "orps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (or (bc_v2i64 (v4f32 VR128:$src1)),
+ (memopv2i64 addr:$src2)))]>;
+ def XORPSrm : PSI<0x57, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+ "xorps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (xor (bc_v2i64 (v4f32 VR128:$src1)),
+ (memopv2i64 addr:$src2)))]>;
+ def ANDNPSrr : PSI<0x55, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "andnps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2i64 (and (xor VR128:$src1,
+ (bc_v2i64 (v4i32 immAllOnesV))),
+ VR128:$src2)))]>;
+ def ANDNPSrm : PSI<0x55, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1,f128mem:$src2),
+ "andnps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2i64 (and (xor (bc_v2i64 (v4f32 VR128:$src1)),
+ (bc_v2i64 (v4i32 immAllOnesV))),
+ (memopv2i64 addr:$src2))))]>;
+}
+
+let Constraints = "$src1 = $dst" in {
+ def CMPPSrri : PSIi8<0xC2, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src, SSECC:$cc),
+ "cmp${cc}ps\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse_cmp_ps VR128:$src1,
+ VR128:$src, imm:$cc))]>;
+ def CMPPSrmi : PSIi8<0xC2, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f128mem:$src, SSECC:$cc),
+ "cmp${cc}ps\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse_cmp_ps VR128:$src1,
+ (memop addr:$src), imm:$cc))]>;
+
+ // Accept explicit immediate argument form instead of comparison code.
+let isAsmParserOnly = 1 in {
+ def CMPPSrri_alt : PSIi8<0xC2, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src, i8imm:$src2),
+ "cmpps\t{$src2, $src, $dst|$dst, $src, $src2}", []>;
+ def CMPPSrmi_alt : PSIi8<0xC2, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f128mem:$src, i8imm:$src2),
+ "cmpps\t{$src2, $src, $dst|$dst, $src, $src2}", []>;
+}
+}
+def : Pat<(v4i32 (X86cmpps (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
+ (CMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
+def : Pat<(v4i32 (X86cmpps (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)),
+ (CMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
+
+// Shuffle and unpack instructions
+let Constraints = "$src1 = $dst" in {
+ let isConvertibleToThreeAddress = 1 in // Convert to pshufd
+ def SHUFPSrri : PSIi8<0xC6, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1,
+ VR128:$src2, i8imm:$src3),
+ "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR128:$dst,
+ (v4f32 (shufp:$src3 VR128:$src1, VR128:$src2)))]>;
+ def SHUFPSrmi : PSIi8<0xC6, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1,
+ f128mem:$src2, i8imm:$src3),
+ "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR128:$dst,
+ (v4f32 (shufp:$src3
+ VR128:$src1, (memopv4f32 addr:$src2))))]>;
+
+ let AddedComplexity = 10 in {
+ def UNPCKHPSrr : PSI<0x15, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "unpckhps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (unpckh VR128:$src1, VR128:$src2)))]>;
+ def UNPCKHPSrm : PSI<0x15, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+ "unpckhps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (unpckh VR128:$src1,
+ (memopv4f32 addr:$src2))))]>;
+
+ def UNPCKLPSrr : PSI<0x14, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "unpcklps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (unpckl VR128:$src1, VR128:$src2)))]>;
+ def UNPCKLPSrm : PSI<0x14, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+ "unpcklps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (unpckl VR128:$src1, (memopv4f32 addr:$src2)))]>;
+ } // AddedComplexity
+} // Constraints = "$src1 = $dst"
+
+// Mask creation
+def MOVMSKPSrr : PSI<0x50, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
+ "movmskps\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (int_x86_sse_movmsk_ps VR128:$src))]>;
+def MOVMSKPDrr : PDI<0x50, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
+ "movmskpd\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (int_x86_sse2_movmsk_pd VR128:$src))]>;
+
+// Prefetch intrinsic.
+def PREFETCHT0 : PSI<0x18, MRM1m, (outs), (ins i8mem:$src),
+ "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3))]>;
+def PREFETCHT1 : PSI<0x18, MRM2m, (outs), (ins i8mem:$src),
+ "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2))]>;
+def PREFETCHT2 : PSI<0x18, MRM3m, (outs), (ins i8mem:$src),
+ "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1))]>;
+def PREFETCHNTA : PSI<0x18, MRM0m, (outs), (ins i8mem:$src),
+ "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0))]>;
+
+// Non-temporal stores
+def MOVNTPSmr_Int : PSI<0x2B, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
+ "movntps\t{$src, $dst|$dst, $src}",
+ [(int_x86_sse_movnt_ps addr:$dst, VR128:$src)]>;
+
+let AddedComplexity = 400 in { // Prefer non-temporal versions
+def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movntps\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>;
+
+def MOVNTDQ_64mr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movntdq\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore (v2f64 VR128:$src), addr:$dst)]>;
+
+def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+ "movnti\t{$src, $dst|$dst, $src}",
+ [(nontemporalstore (i32 GR32:$src), addr:$dst)]>,
+ TB, Requires<[HasSSE2]>;
+
+def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+ "movnti\t{$src, $dst|$dst, $src}",
+ [(nontemporalstore (i64 GR64:$src), addr:$dst)]>,
+ TB, Requires<[HasSSE2]>;
+}
+
+// Store fence
+def SFENCE : I<0xAE, MRM_F8, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>,
+ TB, Requires<[HasSSE1]>;
+
+// MXCSR register
+def LDMXCSR : PSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
+ "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>;
+def STMXCSR : PSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
+ "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>;
+
+// Alias instructions that map zero vector to pxor / xorp* for sse.
+// We set canFoldAsLoad because this can be converted to a constant-pool
+// load of an all-zeros value if folding it would be beneficial.
+// FIXME: Change encoding to pseudo!
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+ isCodeGenOnly = 1 in {
+def V_SET0PS : PSI<0x57, MRMInitReg, (outs VR128:$dst), (ins), "",
+ [(set VR128:$dst, (v4f32 immAllZerosV))]>;
+def V_SET0PD : PDI<0x57, MRMInitReg, (outs VR128:$dst), (ins), "",
+ [(set VR128:$dst, (v2f64 immAllZerosV))]>;
+let ExeDomain = SSEPackedInt in
+def V_SET0PI : PDI<0xEF, MRMInitReg, (outs VR128:$dst), (ins), "",
+ [(set VR128:$dst, (v4i32 immAllZerosV))]>;
+}
+
+def : Pat<(v2i64 immAllZerosV), (V_SET0PI)>;
+def : Pat<(v8i16 immAllZerosV), (V_SET0PI)>;
+def : Pat<(v16i8 immAllZerosV), (V_SET0PI)>;
+
+def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
+ (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
+
+//===---------------------------------------------------------------------===//
+// SSE2 Instructions
+//===---------------------------------------------------------------------===//
+
+// Move Instructions. Register-to-register movsd is not used for FR64
+// register copies because it's a partial register update; FsMOVAPDrr is
+// used instead. Register-to-register movsd is not modeled as an INSERT_SUBREG
+// because INSERT_SUBREG requires that the insert be implementable in terms of
+// a copy, and, as just mentioned, we don't use movsd for copies.
+let Constraints = "$src1 = $dst" in
+def MOVSDrr : SDI<0x10, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, FR64:$src2),
+ "movsd\t{$src2, $dst|$dst, $src2}",
+ [(set (v2f64 VR128:$dst),
+ (movl VR128:$src1, (scalar_to_vector FR64:$src2)))]>;
+
+// Extract the low 64-bit value from one vector and insert it into another.
+let AddedComplexity = 15 in
+def : Pat<(v2f64 (movl VR128:$src1, VR128:$src2)),
+ (MOVSDrr (v2f64 VR128:$src1),
+ (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>;
+
+// Implicitly promote a 64-bit scalar to a vector.
+def : Pat<(v2f64 (scalar_to_vector FR64:$src)),
+ (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src, sub_sd)>;
+
+// Loading from memory automatically zeroes the upper bits.
+let canFoldAsLoad = 1, isReMaterializable = 1, AddedComplexity = 20 in
+def MOVSDrm : SDI<0x10, MRMSrcMem, (outs FR64:$dst), (ins f64mem:$src),
+ "movsd\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (loadf64 addr:$src))]>;
+
+// MOVSDrm zeros the high parts of the register; represent this
+// with SUBREG_TO_REG.
+let AddedComplexity = 20 in {
+def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
+ (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
+def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
+ (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
+def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
+ (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
+def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
+ (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
+def : Pat<(v2f64 (X86vzload addr:$src)),
+ (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
+}
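+// (In the patterns above, the leading (i64 0) operand of SUBREG_TO_REG
+// asserts that the bits outside the inserted sub-register are zero, which is
+// how the implicit zeroing done by the scalar load is modeled.)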
+
+// Store scalar value to memory.
+def MOVSDmr : SDI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src),
+ "movsd\t{$src, $dst|$dst, $src}",
+ [(store FR64:$src, addr:$dst)]>;
+
+// Extract and store.
+def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
+ addr:$dst),
+ (MOVSDmr addr:$dst,
+ (EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd))>;
+
+// Conversion instructions
+def CVTTSD2SIrr : SDI<0x2C, MRMSrcReg, (outs GR32:$dst), (ins FR64:$src),
+ "cvttsd2si\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (fp_to_sint FR64:$src))]>;
+def CVTTSD2SIrm : SDI<0x2C, MRMSrcMem, (outs GR32:$dst), (ins f64mem:$src),
+ "cvttsd2si\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (fp_to_sint (loadf64 addr:$src)))]>;
+def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
+ "cvtsd2ss\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (fround FR64:$src))]>;
+def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
+ "cvtsd2ss\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (fround (loadf64 addr:$src)))]>, XD,
+ Requires<[HasSSE2, OptForSize]>;
+def CVTSI2SDrr : SDI<0x2A, MRMSrcReg, (outs FR64:$dst), (ins GR32:$src),
+ "cvtsi2sd\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (sint_to_fp GR32:$src))]>;
+def CVTSI2SDrm : SDI<0x2A, MRMSrcMem, (outs FR64:$dst), (ins i32mem:$src),
+ "cvtsi2sd\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (sint_to_fp (loadi32 addr:$src)))]>;
+
+def CVTPD2DQrm : S3DI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvtpd2dq\t{$src, $dst|$dst, $src}", []>;
+def CVTPD2DQrr : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtpd2dq\t{$src, $dst|$dst, $src}", []>;
+def CVTDQ2PDrm : S3SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvtdq2pd\t{$src, $dst|$dst, $src}", []>;
+def CVTDQ2PDrr : S3SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtdq2pd\t{$src, $dst|$dst, $src}", []>;
+def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtps2dq\t{$src, $dst|$dst, $src}", []>;
+def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvtps2dq\t{$src, $dst|$dst, $src}", []>;
+def CVTDQ2PSrr : PSI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtdq2ps\t{$src, $dst|$dst, $src}", []>;
+def CVTDQ2PSrm : PSI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvtdq2ps\t{$src, $dst|$dst, $src}", []>;
+def COMISDrr: PDI<0x2F, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
+ "comisd\t{$src2, $src1|$src1, $src2}", []>;
+def COMISDrm: PDI<0x2F, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
+ "comisd\t{$src2, $src1|$src1, $src2}", []>;
+
+// SSE2 instructions with XS prefix
+def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
+ "cvtss2sd\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (fextend FR32:$src))]>, XS,
+ Requires<[HasSSE2]>;
+def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
+ "cvtss2sd\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (extloadf32 addr:$src))]>, XS,
+ Requires<[HasSSE2, OptForSize]>;
+
+def : Pat<(extloadf32 addr:$src),
+ (CVTSS2SDrr (MOVSSrm addr:$src))>,
+ Requires<[HasSSE2, OptForSpeed]>;
+
+// Match intrinsics which expect XMM operand(s).
+def Int_CVTSD2SIrr : SDI<0x2D, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
+ "cvtsd2si\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (int_x86_sse2_cvtsd2si VR128:$src))]>;
+def Int_CVTSD2SIrm : SDI<0x2D, MRMSrcMem, (outs GR32:$dst), (ins f128mem:$src),
+ "cvtsd2si\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (int_x86_sse2_cvtsd2si
+ (load addr:$src)))]>;
+
+// Match intrinsics which expect MM and XMM operand(s).
+def Int_CVTPD2PIrr : PDI<0x2D, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
+ "cvtpd2pi\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst, (int_x86_sse_cvtpd2pi VR128:$src))]>;
+def Int_CVTPD2PIrm : PDI<0x2D, MRMSrcMem, (outs VR64:$dst), (ins f128mem:$src),
+ "cvtpd2pi\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst, (int_x86_sse_cvtpd2pi
+ (memop addr:$src)))]>;
+def Int_CVTTPD2PIrr: PDI<0x2C, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
+ "cvttpd2pi\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst, (int_x86_sse_cvttpd2pi VR128:$src))]>;
+def Int_CVTTPD2PIrm: PDI<0x2C, MRMSrcMem, (outs VR64:$dst), (ins f128mem:$src),
+ "cvttpd2pi\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst, (int_x86_sse_cvttpd2pi
+ (memop addr:$src)))]>;
+def Int_CVTPI2PDrr : PDI<0x2A, MRMSrcReg, (outs VR128:$dst), (ins VR64:$src),
+ "cvtpi2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse_cvtpi2pd VR64:$src))]>;
+def Int_CVTPI2PDrm : PDI<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ "cvtpi2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse_cvtpi2pd
+ (load addr:$src)))]>;
+
+// Aliases for intrinsics
+def Int_CVTTSD2SIrr : SDI<0x2C, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
+ "cvttsd2si\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst,
+ (int_x86_sse2_cvttsd2si VR128:$src))]>;
+def Int_CVTTSD2SIrm : SDI<0x2C, MRMSrcMem, (outs GR32:$dst), (ins f128mem:$src),
+ "cvttsd2si\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (int_x86_sse2_cvttsd2si
+ (load addr:$src)))]>;
+
+// Comparison instructions
+let Constraints = "$src1 = $dst", neverHasSideEffects = 1 in {
+ def CMPSDrr : SDIi8<0xC2, MRMSrcReg,
+ (outs FR64:$dst), (ins FR64:$src1, FR64:$src, SSECC:$cc),
+ "cmp${cc}sd\t{$src, $dst|$dst, $src}", []>;
+let mayLoad = 1 in
+ def CMPSDrm : SDIi8<0xC2, MRMSrcMem,
+ (outs FR64:$dst), (ins FR64:$src1, f64mem:$src, SSECC:$cc),
+ "cmp${cc}sd\t{$src, $dst|$dst, $src}", []>;
+
+ // Accept explicit immediate argument form instead of comparison code.
+let isAsmParserOnly = 1 in {
+ def CMPSDrr_alt : SDIi8<0xC2, MRMSrcReg,
+ (outs FR64:$dst), (ins FR64:$src1, FR64:$src, i8imm:$src2),
+ "cmpsd\t{$src2, $src, $dst|$dst, $src, $src2}", []>;
+let mayLoad = 1 in
+ def CMPSDrm_alt : SDIi8<0xC2, MRMSrcMem,
+ (outs FR64:$dst), (ins FR64:$src1, f64mem:$src, i8imm:$src2),
+ "cmpsd\t{$src2, $src, $dst|$dst, $src, $src2}", []>;
+}
+}
+
+let Defs = [EFLAGS] in {
+def UCOMISDrr: PDI<0x2E, MRMSrcReg, (outs), (ins FR64:$src1, FR64:$src2),
+ "ucomisd\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86cmp FR64:$src1, FR64:$src2))]>;
+def UCOMISDrm: PDI<0x2E, MRMSrcMem, (outs), (ins FR64:$src1, f64mem:$src2),
+ "ucomisd\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86cmp FR64:$src1, (loadf64 addr:$src2)))]>;
+} // Defs = [EFLAGS]
+
+// Aliases to match intrinsics which expect XMM operand(s).
+let Constraints = "$src1 = $dst" in {
+ def Int_CMPSDrr : SDIi8<0xC2, MRMSrcReg,
+ (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src, SSECC:$cc),
+ "cmp${cc}sd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cmp_sd VR128:$src1,
+ VR128:$src, imm:$cc))]>;
+ def Int_CMPSDrm : SDIi8<0xC2, MRMSrcMem,
+ (outs VR128:$dst),
+ (ins VR128:$src1, f64mem:$src, SSECC:$cc),
+ "cmp${cc}sd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cmp_sd VR128:$src1,
+ (load addr:$src), imm:$cc))]>;
+}
+
+let Defs = [EFLAGS] in {
+def Int_UCOMISDrr: PDI<0x2E, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
+ "ucomisd\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86ucomi (v2f64 VR128:$src1),
+ VR128:$src2))]>;
+def Int_UCOMISDrm: PDI<0x2E, MRMSrcMem, (outs),(ins VR128:$src1, f128mem:$src2),
+ "ucomisd\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86ucomi (v2f64 VR128:$src1),
+ (load addr:$src2)))]>;
+
+def Int_COMISDrr: PDI<0x2F, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
+ "comisd\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86comi (v2f64 VR128:$src1),
+ VR128:$src2))]>;
+def Int_COMISDrm: PDI<0x2F, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
+ "comisd\t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86comi (v2f64 VR128:$src1),
+ (load addr:$src2)))]>;
+} // Defs = [EFLAGS]
+
+// Aliases of packed SSE2 instructions for scalar use. These all have names
+// that start with 'Fs'.
+
+// Alias instructions that map fld0 to pxor for sse.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isCodeGenOnly = 1,
+ canFoldAsLoad = 1 in
+def FsFLD0SD : I<0xEF, MRMInitReg, (outs FR64:$dst), (ins), "",
+ [(set FR64:$dst, fpimm0)]>,
+ Requires<[HasSSE2]>, TB, OpSize;
+
+// Alias instruction to do FR64 reg-to-reg copy using movapd. Upper bits are
+// disregarded.
+let neverHasSideEffects = 1 in
+def FsMOVAPDrr : PDI<0x28, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
+ "movapd\t{$src, $dst|$dst, $src}", []>;
+
+// Alias instruction to load FR64 from f128mem using movapd. Upper bits are
+// disregarded.
+let canFoldAsLoad = 1, isReMaterializable = 1 in
+def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
+ "movapd\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (alignedloadfsf64 addr:$src))]>;
+
+// Alias bitwise logical operations using SSE logical ops on packed FP values.
+let Constraints = "$src1 = $dst" in {
+let isCommutable = 1 in {
+ def FsANDPDrr : PDI<0x54, MRMSrcReg, (outs FR64:$dst),
+ (ins FR64:$src1, FR64:$src2),
+ "andpd\t{$src2, $dst|$dst, $src2}",
+ [(set FR64:$dst, (X86fand FR64:$src1, FR64:$src2))]>;
+ def FsORPDrr : PDI<0x56, MRMSrcReg, (outs FR64:$dst),
+ (ins FR64:$src1, FR64:$src2),
+ "orpd\t{$src2, $dst|$dst, $src2}",
+ [(set FR64:$dst, (X86for FR64:$src1, FR64:$src2))]>;
+ def FsXORPDrr : PDI<0x57, MRMSrcReg, (outs FR64:$dst),
+ (ins FR64:$src1, FR64:$src2),
+ "xorpd\t{$src2, $dst|$dst, $src2}",
+ [(set FR64:$dst, (X86fxor FR64:$src1, FR64:$src2))]>;
+}
+
+def FsANDPDrm : PDI<0x54, MRMSrcMem, (outs FR64:$dst),
+ (ins FR64:$src1, f128mem:$src2),
+ "andpd\t{$src2, $dst|$dst, $src2}",
+ [(set FR64:$dst, (X86fand FR64:$src1,
+ (memopfsf64 addr:$src2)))]>;
+def FsORPDrm : PDI<0x56, MRMSrcMem, (outs FR64:$dst),
+ (ins FR64:$src1, f128mem:$src2),
+ "orpd\t{$src2, $dst|$dst, $src2}",
+ [(set FR64:$dst, (X86for FR64:$src1,
+ (memopfsf64 addr:$src2)))]>;
+def FsXORPDrm : PDI<0x57, MRMSrcMem, (outs FR64:$dst),
+ (ins FR64:$src1, f128mem:$src2),
+ "xorpd\t{$src2, $dst|$dst, $src2}",
+ [(set FR64:$dst, (X86fxor FR64:$src1,
+ (memopfsf64 addr:$src2)))]>;
+
+let neverHasSideEffects = 1 in {
+def FsANDNPDrr : PDI<0x55, MRMSrcReg,
+ (outs FR64:$dst), (ins FR64:$src1, FR64:$src2),
+ "andnpd\t{$src2, $dst|$dst, $src2}", []>;
+let mayLoad = 1 in
+def FsANDNPDrm : PDI<0x55, MRMSrcMem,
+ (outs FR64:$dst), (ins FR64:$src1, f128mem:$src2),
+ "andnpd\t{$src2, $dst|$dst, $src2}", []>;
+}
+}
+
+/// basic_sse2_fp_binop_rm - SSE2 binops come in both scalar and vector forms.
+///
+/// In addition, we also have a special variant of the scalar form here to
+/// represent the associated intrinsic operation. This form is unlike the
+/// plain scalar form, in that it takes an entire vector (instead of a scalar)
+/// and leaves the top elements unmodified (therefore these cannot be commuted).
+///
+/// These three forms can each be reg+reg or reg+mem, so there are a total of
+/// six "instructions".
+///
+let Constraints = "$src1 = $dst" in {
+multiclass basic_sse2_fp_binop_rm<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, Intrinsic F64Int,
+ bit Commutable = 0> {
+ // Scalar operation, reg+reg.
+ def SDrr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR64:$src2),
+ !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
+ [(set FR64:$dst, (OpNode FR64:$src1, FR64:$src2))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Scalar operation, reg+mem.
+ def SDrm : SDI<opc, MRMSrcMem, (outs FR64:$dst),
+ (ins FR64:$src1, f64mem:$src2),
+ !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
+ [(set FR64:$dst, (OpNode FR64:$src1, (load addr:$src2)))]>;
+
+ // Vector operation, reg+reg.
+ def PDrr : PDI<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "pd\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (v2f64 (OpNode VR128:$src1, VR128:$src2)))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Vector operation, reg+mem.
+ def PDrm : PDI<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, f128mem:$src2),
+ !strconcat(OpcodeStr, "pd\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (OpNode VR128:$src1, (memopv2f64 addr:$src2)))]>;
+
+ // Intrinsic operation, reg+reg.
+ def SDrr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2))]>;
+
+ // Intrinsic operation, reg+mem.
+ def SDrm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, sdmem:$src2),
+ !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (F64Int VR128:$src1,
+ sse_load_f64:$src2))]>;
+}
+}
+
+// Arithmetic instructions
+defm ADD : basic_sse2_fp_binop_rm<0x58, "add", fadd, int_x86_sse2_add_sd, 1>;
+defm MUL : basic_sse2_fp_binop_rm<0x59, "mul", fmul, int_x86_sse2_mul_sd, 1>;
+defm SUB : basic_sse2_fp_binop_rm<0x5C, "sub", fsub, int_x86_sse2_sub_sd>;
+defm DIV : basic_sse2_fp_binop_rm<0x5E, "div", fdiv, int_x86_sse2_div_sd>;
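+// For reference: each defm above instantiates the multiclass with its name
+// as a prefix, so "defm ADD" should yield ADDSDrr, ADDSDrm, ADDPDrr, ADDPDrm,
+// ADDSDrr_Int and ADDSDrm_Int (the six forms described in the comment above).
+// Written out by hand, the scalar reg+reg form would look roughly like this
+// (inside the $src1 = $dst constraint, with isCommutable set):
+//   def ADDSDrr : SDI<0x58, MRMSrcReg,
+//                     (outs FR64:$dst), (ins FR64:$src1, FR64:$src2),
+//                     "addsd\t{$src2, $dst|$dst, $src2}",
+//                     [(set FR64:$dst, (fadd FR64:$src1, FR64:$src2))]>;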
+
+/// sse2_fp_binop_rm - Other SSE2 binops
+///
+/// This multiclass is like basic_sse2_fp_binop_rm, with the addition of
+/// instructions for a full-vector intrinsic form. Operations that map
+/// onto C operators don't use this form since they just use the plain
+/// vector form instead of having a separate vector intrinsic form.
+///
+/// This provides a total of eight "instructions".
+///
+let Constraints = "$src1 = $dst" in {
+multiclass sse2_fp_binop_rm<bits<8> opc, string OpcodeStr,
+ SDNode OpNode,
+ Intrinsic F64Int,
+ Intrinsic V2F64Int,
+ bit Commutable = 0> {
+
+ // Scalar operation, reg+reg.
+ def SDrr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR64:$src2),
+ !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
+ [(set FR64:$dst, (OpNode FR64:$src1, FR64:$src2))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Scalar operation, reg+mem.
+ def SDrm : SDI<opc, MRMSrcMem, (outs FR64:$dst),
+ (ins FR64:$src1, f64mem:$src2),
+ !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
+ [(set FR64:$dst, (OpNode FR64:$src1, (load addr:$src2)))]>;
+
+ // Vector operation, reg+reg.
+ def PDrr : PDI<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "pd\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (v2f64 (OpNode VR128:$src1, VR128:$src2)))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Vector operation, reg+mem.
+ def PDrm : PDI<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, f128mem:$src2),
+ !strconcat(OpcodeStr, "pd\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (OpNode VR128:$src1, (memopv2f64 addr:$src2)))]>;
+
+ // Intrinsic operation, reg+reg.
+ def SDrr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Intrinsic operation, reg+mem.
+ def SDrm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, sdmem:$src2),
+ !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (F64Int VR128:$src1,
+ sse_load_f64:$src2))]>;
+
+ // Vector intrinsic operation, reg+reg.
+ def PDrr_Int : PDI<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "pd\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (V2F64Int VR128:$src1, VR128:$src2))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Vector intrinsic operation, reg+mem.
+ def PDrm_Int : PDI<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, f128mem:$src2),
+ !strconcat(OpcodeStr, "pd\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (V2F64Int VR128:$src1,
+ (memopv2f64 addr:$src2)))]>;
+}
+}
+
+defm MAX : sse2_fp_binop_rm<0x5F, "max", X86fmax,
+ int_x86_sse2_max_sd, int_x86_sse2_max_pd>;
+defm MIN : sse2_fp_binop_rm<0x5D, "min", X86fmin,
+ int_x86_sse2_min_sd, int_x86_sse2_min_pd>;
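+// Likewise, "defm MAX" and "defm MIN" should each expand to the eight forms
+// noted above (e.g. MAXSDrr, MAXSDrm, MAXPDrr, MAXPDrm, MAXSDrr_Int,
+// MAXSDrm_Int, MAXPDrr_Int and MAXPDrm_Int).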
+
+//===---------------------------------------------------------------------===//
+// SSE packed FP Instructions
+
+// Move Instructions
+let neverHasSideEffects = 1 in
+def MOVAPDrr : PDI<0x28, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movapd\t{$src, $dst|$dst, $src}", []>;
+let canFoldAsLoad = 1, isReMaterializable = 1 in
+def MOVAPDrm : PDI<0x28, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "movapd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (alignedloadv2f64 addr:$src))]>;
+
+def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movapd\t{$src, $dst|$dst, $src}",
+ [(alignedstore (v2f64 VR128:$src), addr:$dst)]>;
+
+let neverHasSideEffects = 1 in
+def MOVUPDrr : PDI<0x10, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movupd\t{$src, $dst|$dst, $src}", []>;
+let canFoldAsLoad = 1 in
+def MOVUPDrm : PDI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "movupd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (loadv2f64 addr:$src))]>;
+def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movupd\t{$src, $dst|$dst, $src}",
+ [(store (v2f64 VR128:$src), addr:$dst)]>;
+
+// Intrinsic forms of MOVUPD load and store
+def MOVUPDrm_Int : PDI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "movupd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_loadu_pd addr:$src))]>;
+def MOVUPDmr_Int : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movupd\t{$src, $dst|$dst, $src}",
+ [(int_x86_sse2_storeu_pd addr:$dst, VR128:$src)]>;
+
+let Constraints = "$src1 = $dst" in {
+ let AddedComplexity = 20 in {
+ def MOVLPDrm : PDI<0x12, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
+ "movlpd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2f64 (movlp VR128:$src1,
+ (scalar_to_vector (loadf64 addr:$src2)))))]>;
+ def MOVHPDrm : PDI<0x16, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
+ "movhpd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2f64 (movlhps VR128:$src1,
+ (scalar_to_vector (loadf64 addr:$src2)))))]>;
+ } // AddedComplexity
+} // Constraints = "$src1 = $dst"
+
+def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movlpd\t{$src, $dst|$dst, $src}",
+ [(store (f64 (vector_extract (v2f64 VR128:$src),
+ (iPTR 0))), addr:$dst)]>;
+
+// v2f64 extract element 1 is always custom lowered to unpack high to low
+// and extract element 0 so the non-store version isn't too horrible.
+def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+ "movhpd\t{$src, $dst|$dst, $src}",
+ [(store (f64 (vector_extract
+ (v2f64 (unpckh VR128:$src, (undef))),
+ (iPTR 0))), addr:$dst)]>;
+
+// SSE2 instructions without OpSize prefix
+def Int_CVTDQ2PSrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtdq2ps\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtdq2ps VR128:$src))]>,
+ TB, Requires<[HasSSE2]>;
+def Int_CVTDQ2PSrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "cvtdq2ps\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtdq2ps
+ (bitconvert (memopv2i64 addr:$src))))]>,
+ TB, Requires<[HasSSE2]>;
+
+// SSE2 instructions with XS prefix
+def Int_CVTDQ2PDrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtdq2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))]>,
+ XS, Requires<[HasSSE2]>;
+def Int_CVTDQ2PDrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ "cvtdq2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtdq2pd
+ (bitconvert (memopv2i64 addr:$src))))]>,
+ XS, Requires<[HasSSE2]>;
+
+def Int_CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))]>;
+def Int_CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvtps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtps2dq
+ (memop addr:$src)))]>;
+// SSE2 packed instructions with XS prefix
+def CVTTPS2DQrr : SSI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvttps2dq\t{$src, $dst|$dst, $src}", []>;
+def CVTTPS2DQrm : SSI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvttps2dq\t{$src, $dst|$dst, $src}", []>;
+
+def Int_CVTTPS2DQrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvttps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvttps2dq VR128:$src))]>,
+ XS, Requires<[HasSSE2]>;
+def Int_CVTTPS2DQrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvttps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvttps2dq
+ (memop addr:$src)))]>,
+ XS, Requires<[HasSSE2]>;
+
+// SSE2 packed instructions with XD prefix
+def Int_CVTPD2DQrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtpd2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>,
+ XD, Requires<[HasSSE2]>;
+def Int_CVTPD2DQrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvtpd2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtpd2dq
+ (memop addr:$src)))]>,
+ XD, Requires<[HasSSE2]>;
+
+def Int_CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvttpd2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))]>;
+def Int_CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
+ "cvttpd2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
+ (memop addr:$src)))]>;
+
+// SSE2 instructions without OpSize prefix
+def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtps2pd\t{$src, $dst|$dst, $src}", []>, TB;
+def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
+ "cvtps2pd\t{$src, $dst|$dst, $src}", []>, TB;
+
+def Int_CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtps2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))]>,
+ TB, Requires<[HasSSE2]>;
+def Int_CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
+ "cvtps2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtps2pd
+ (load addr:$src)))]>,
+ TB, Requires<[HasSSE2]>;
+
+def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtpd2ps\t{$src, $dst|$dst, $src}", []>;
+def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvtpd2ps\t{$src, $dst|$dst, $src}", []>;
+
+
+def Int_CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvtpd2ps\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))]>;
+def Int_CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvtpd2ps\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtpd2ps
+ (memop addr:$src)))]>;
+
+// Match intrinsics which expect XMM operand(s).
+// Aliases for intrinsics
+let Constraints = "$src1 = $dst" in {
+def Int_CVTSI2SDrr: SDI<0x2A, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, GR32:$src2),
+ "cvtsi2sd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse2_cvtsi2sd VR128:$src1,
+ GR32:$src2))]>;
+def Int_CVTSI2SDrm: SDI<0x2A, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i32mem:$src2),
+ "cvtsi2sd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse2_cvtsi2sd VR128:$src1,
+ (loadi32 addr:$src2)))]>;
+def Int_CVTSD2SSrr: SDI<0x5A, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse2_cvtsd2ss VR128:$src1,
+ VR128:$src2))]>;
+def Int_CVTSD2SSrm: SDI<0x5A, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
+ "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse2_cvtsd2ss VR128:$src1,
+ (load addr:$src2)))]>;
+def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "cvtss2sd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1,
+ VR128:$src2))]>, XS,
+ Requires<[HasSSE2]>;
+def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f32mem:$src2),
+ "cvtss2sd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1,
+ (load addr:$src2)))]>, XS,
+ Requires<[HasSSE2]>;
+}
+
+// Arithmetic
+
+/// sse2_fp_unop_rm - SSE2 unops come in both scalar and vector forms.
+///
+/// In addition, we also have a special variant of the scalar form here to
+/// represent the associated intrinsic operation. This form is unlike the
+/// plain scalar form, in that it takes an entire vector (instead of a
+/// scalar) and leaves the top elements undefined.
+///
+/// In addition, there is a variant for the full-vector intrinsic form.
+///
+/// These four forms can each have a reg or a mem operand, so there are a
+/// total of eight "instructions".
+///
+multiclass sse2_fp_unop_rm<bits<8> opc, string OpcodeStr,
+ SDNode OpNode,
+ Intrinsic F64Int,
+ Intrinsic V2F64Int,
+ bit Commutable = 0> {
+ // Scalar operation, reg.
+ def SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
+ !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
+ [(set FR64:$dst, (OpNode FR64:$src))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Scalar operation, mem.
+ def SDm : SDI<opc, MRMSrcMem, (outs FR64:$dst), (ins f64mem:$src),
+ !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
+ [(set FR64:$dst, (OpNode (load addr:$src)))]>;
+
+ // Vector operation, reg.
+ def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Vector operation, mem.
+ def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))]>;
+
+ // Intrinsic operation, reg.
+ def SDr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (F64Int VR128:$src))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Intrinsic operation, mem.
+ def SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst), (ins sdmem:$src),
+ !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (F64Int sse_load_f64:$src))]>;
+
+ // Vector intrinsic operation, reg
+ def PDr_Int : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (V2F64Int VR128:$src))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Vector intrinsic operation, mem
+ def PDm_Int : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (V2F64Int (memopv2f64 addr:$src)))]>;
+}
+
+// Square root.
+defm SQRT : sse2_fp_unop_rm<0x51, "sqrt", fsqrt,
+ int_x86_sse2_sqrt_sd, int_x86_sse2_sqrt_pd>;
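+// "defm SQRT" here should expand to the eight unop forms noted in the
+// multiclass comment: SQRTSDr, SQRTSDm, SQRTPDr, SQRTPDm, SQRTSDr_Int,
+// SQRTSDm_Int, SQRTPDr_Int and SQRTPDm_Int.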
+
+// There are no f64 versions of the reciprocal approximation instructions.
+
+// Logical
+let Constraints = "$src1 = $dst" in {
+ let isCommutable = 1 in {
+ def ANDPDrr : PDI<0x54, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "andpd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (and (bc_v2i64 (v2f64 VR128:$src1)),
+ (bc_v2i64 (v2f64 VR128:$src2))))]>;
+ def ORPDrr : PDI<0x56, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "orpd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (or (bc_v2i64 (v2f64 VR128:$src1)),
+ (bc_v2i64 (v2f64 VR128:$src2))))]>;
+ def XORPDrr : PDI<0x57, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "xorpd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (xor (bc_v2i64 (v2f64 VR128:$src1)),
+ (bc_v2i64 (v2f64 VR128:$src2))))]>;
+ }
+
+ def ANDPDrm : PDI<0x54, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+ "andpd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (and (bc_v2i64 (v2f64 VR128:$src1)),
+ (memopv2i64 addr:$src2)))]>;
+ def ORPDrm : PDI<0x56, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+ "orpd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (or (bc_v2i64 (v2f64 VR128:$src1)),
+ (memopv2i64 addr:$src2)))]>;
+ def XORPDrm : PDI<0x57, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+ "xorpd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (xor (bc_v2i64 (v2f64 VR128:$src1)),
+ (memopv2i64 addr:$src2)))]>;
+ def ANDNPDrr : PDI<0x55, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "andnpd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (and (vnot (bc_v2i64 (v2f64 VR128:$src1))),
+ (bc_v2i64 (v2f64 VR128:$src2))))]>;
+ def ANDNPDrm : PDI<0x55, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1,f128mem:$src2),
+ "andnpd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (and (vnot (bc_v2i64 (v2f64 VR128:$src1))),
+ (memopv2i64 addr:$src2)))]>;
+}
+
+let Constraints = "$src1 = $dst" in {
+ def CMPPDrri : PDIi8<0xC2, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src, SSECC:$cc),
+ "cmp${cc}pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cmp_pd VR128:$src1,
+ VR128:$src, imm:$cc))]>;
+ def CMPPDrmi : PDIi8<0xC2, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f128mem:$src, SSECC:$cc),
+ "cmp${cc}pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cmp_pd VR128:$src1,
+ (memop addr:$src), imm:$cc))]>;
+
+ // Accept explicit immediate argument form instead of comparison code.
+let isAsmParserOnly = 1 in {
+ def CMPPDrri_alt : PDIi8<0xC2, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src, i8imm:$src2),
+ "cmppd\t{$src2, $src, $dst|$dst, $src, $src2}", []>;
+ def CMPPDrmi_alt : PDIi8<0xC2, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f128mem:$src, i8imm:$src2),
+ "cmppd\t{$src2, $src, $dst|$dst, $src, $src2}", []>;
+}
+}
+def : Pat<(v2i64 (X86cmppd (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
+ (CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
+def : Pat<(v2i64 (X86cmppd (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)),
+ (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
+
+// Shuffle and unpack instructions
+let Constraints = "$src1 = $dst" in {
+ def SHUFPDrri : PDIi8<0xC6, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+ "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR128:$dst,
+ (v2f64 (shufp:$src3 VR128:$src1, VR128:$src2)))]>;
+ def SHUFPDrmi : PDIi8<0xC6, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1,
+ f128mem:$src2, i8imm:$src3),
+ "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR128:$dst,
+ (v2f64 (shufp:$src3
+ VR128:$src1, (memopv2f64 addr:$src2))))]>;
+
+ let AddedComplexity = 10 in {
+ def UNPCKHPDrr : PDI<0x15, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "unpckhpd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2f64 (unpckh VR128:$src1, VR128:$src2)))]>;
+ def UNPCKHPDrm : PDI<0x15, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+ "unpckhpd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2f64 (unpckh VR128:$src1,
+ (memopv2f64 addr:$src2))))]>;
+
+ def UNPCKLPDrr : PDI<0x14, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "unpcklpd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2f64 (unpckl VR128:$src1, VR128:$src2)))]>;
+ def UNPCKLPDrm : PDI<0x14, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+ "unpcklpd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (unpckl VR128:$src1, (memopv2f64 addr:$src2)))]>;
+ } // AddedComplexity
+} // Constraints = "$src1 = $dst"
+
+
+//===---------------------------------------------------------------------===//
+// SSE integer instructions
+let ExeDomain = SSEPackedInt in {
+
+// Move Instructions
+let neverHasSideEffects = 1 in
+def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movdqa\t{$src, $dst|$dst, $src}", []>;
+let canFoldAsLoad = 1, mayLoad = 1 in
+def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "movdqa\t{$src, $dst|$dst, $src}",
+ [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/]>;
+let mayStore = 1 in
+def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
+ "movdqa\t{$src, $dst|$dst, $src}",
+ [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/]>;
+let canFoldAsLoad = 1, mayLoad = 1 in
+def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "movdqu\t{$src, $dst|$dst, $src}",
+ [/*(set VR128:$dst, (loadv2i64 addr:$src))*/]>,
+ XS, Requires<[HasSSE2]>;
+let mayStore = 1 in
+def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
+ "movdqu\t{$src, $dst|$dst, $src}",
+ [/*(store (v2i64 VR128:$src), addr:$dst)*/]>,
+ XS, Requires<[HasSSE2]>;
+
+// Intrinsic forms of MOVDQU load and store
+let canFoldAsLoad = 1 in
+def MOVDQUrm_Int : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "movdqu\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_loadu_dq addr:$src))]>,
+ XS, Requires<[HasSSE2]>;
+def MOVDQUmr_Int : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
+ "movdqu\t{$src, $dst|$dst, $src}",
+ [(int_x86_sse2_storeu_dq addr:$dst, VR128:$src)]>,
+ XS, Requires<[HasSSE2]>;
+
+let Constraints = "$src1 = $dst" in {
+
+multiclass PDI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
+ bit Commutable = 0> {
+ def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2))]> {
+ let isCommutable = Commutable;
+ }
+ def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (IntId VR128:$src1,
+ (bitconvert (memopv2i64
+ addr:$src2))))]>;
+}
+
+multiclass PDI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm,
+ string OpcodeStr,
+ Intrinsic IntId, Intrinsic IntId2> {
+ def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2))]>;
+ def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (IntId VR128:$src1,
+ (bitconvert (memopv2i64 addr:$src2))))]>;
+ def ri : PDIi8<opc2, ImmForm, (outs VR128:$dst),
+ (ins VR128:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (IntId2 VR128:$src1, (i32 imm:$src2)))]>;
+}
+
+/// PDI_binop_rm - Simple SSE2 binary operator.
+multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType OpVT, bit Commutable = 0> {
+ def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (OpVT (OpNode VR128:$src1, VR128:$src2)))]> {
+ let isCommutable = Commutable;
+ }
+ def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (OpVT (OpNode VR128:$src1,
+ (bitconvert (memopv2i64 addr:$src2)))))]>;
+}
+
+/// PDI_binop_rm_v2i64 - Simple SSE2 binary operator whose type is v2i64.
+///
+/// FIXME: we could eliminate this and use PDI_binop_rm instead if tblgen knew
+/// to collapse (bitconvert VT to VT) into its operand.
+///
+multiclass PDI_binop_rm_v2i64<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ bit Commutable = 0> {
+ def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (v2i64 (OpNode VR128:$src1, VR128:$src2)))]> {
+ let isCommutable = Commutable;
+ }
+ def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (OpNode VR128:$src1,
+ (memopv2i64 addr:$src2)))]>;
+}
+
+} // Constraints = "$src1 = $dst"
+} // ExeDomain = SSEPackedInt
+
+// 128-bit Integer Arithmetic
+
+defm PADDB : PDI_binop_rm<0xFC, "paddb", add, v16i8, 1>;
+defm PADDW : PDI_binop_rm<0xFD, "paddw", add, v8i16, 1>;
+defm PADDD : PDI_binop_rm<0xFE, "paddd", add, v4i32, 1>;
+defm PADDQ : PDI_binop_rm_v2i64<0xD4, "paddq", add, 1>;
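+// For reference, each PDI_binop_rm defm yields a reg+reg and a reg+mem def;
+// e.g. "defm PADDB" should produce PADDBrr and PADDBrm, where PADDBrr is
+// roughly equivalent to:
+//   def PADDBrr : PDI<0xFC, MRMSrcReg,
+//                     (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+//                     "paddb\t{$src2, $dst|$dst, $src2}",
+//                     [(set VR128:$dst,
+//                        (v16i8 (add VR128:$src1, VR128:$src2)))]>;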
+
+defm PADDSB : PDI_binop_rm_int<0xEC, "paddsb" , int_x86_sse2_padds_b, 1>;
+defm PADDSW : PDI_binop_rm_int<0xED, "paddsw" , int_x86_sse2_padds_w, 1>;
+defm PADDUSB : PDI_binop_rm_int<0xDC, "paddusb", int_x86_sse2_paddus_b, 1>;
+defm PADDUSW : PDI_binop_rm_int<0xDD, "paddusw", int_x86_sse2_paddus_w, 1>;
+
+defm PSUBB : PDI_binop_rm<0xF8, "psubb", sub, v16i8>;
+defm PSUBW : PDI_binop_rm<0xF9, "psubw", sub, v8i16>;
+defm PSUBD : PDI_binop_rm<0xFA, "psubd", sub, v4i32>;
+defm PSUBQ : PDI_binop_rm_v2i64<0xFB, "psubq", sub>;
+
+defm PSUBSB : PDI_binop_rm_int<0xE8, "psubsb" , int_x86_sse2_psubs_b>;
+defm PSUBSW : PDI_binop_rm_int<0xE9, "psubsw" , int_x86_sse2_psubs_w>;
+defm PSUBUSB : PDI_binop_rm_int<0xD8, "psubusb", int_x86_sse2_psubus_b>;
+defm PSUBUSW : PDI_binop_rm_int<0xD9, "psubusw", int_x86_sse2_psubus_w>;
+
+defm PMULLW : PDI_binop_rm<0xD5, "pmullw", mul, v8i16, 1>;
+
+defm PMULHUW : PDI_binop_rm_int<0xE4, "pmulhuw", int_x86_sse2_pmulhu_w, 1>;
+defm PMULHW : PDI_binop_rm_int<0xE5, "pmulhw" , int_x86_sse2_pmulh_w , 1>;
+defm PMULUDQ : PDI_binop_rm_int<0xF4, "pmuludq", int_x86_sse2_pmulu_dq, 1>;
+
+defm PMADDWD : PDI_binop_rm_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd, 1>;
+
+defm PAVGB : PDI_binop_rm_int<0xE0, "pavgb", int_x86_sse2_pavg_b, 1>;
+defm PAVGW : PDI_binop_rm_int<0xE3, "pavgw", int_x86_sse2_pavg_w, 1>;
+
+
+defm PMINUB : PDI_binop_rm_int<0xDA, "pminub", int_x86_sse2_pminu_b, 1>;
+defm PMINSW : PDI_binop_rm_int<0xEA, "pminsw", int_x86_sse2_pmins_w, 1>;
+defm PMAXUB : PDI_binop_rm_int<0xDE, "pmaxub", int_x86_sse2_pmaxu_b, 1>;
+defm PMAXSW : PDI_binop_rm_int<0xEE, "pmaxsw", int_x86_sse2_pmaxs_w, 1>;
+defm PSADBW : PDI_binop_rm_int<0xF6, "psadbw", int_x86_sse2_psad_bw, 1>;
+
+
+defm PSLLW : PDI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw",
+ int_x86_sse2_psll_w, int_x86_sse2_pslli_w>;
+defm PSLLD : PDI_binop_rmi_int<0xF2, 0x72, MRM6r, "pslld",
+ int_x86_sse2_psll_d, int_x86_sse2_pslli_d>;
+defm PSLLQ : PDI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq",
+ int_x86_sse2_psll_q, int_x86_sse2_pslli_q>;
+
+defm PSRLW : PDI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw",
+ int_x86_sse2_psrl_w, int_x86_sse2_psrli_w>;
+defm PSRLD : PDI_binop_rmi_int<0xD2, 0x72, MRM2r, "psrld",
+ int_x86_sse2_psrl_d, int_x86_sse2_psrli_d>;
+defm PSRLQ : PDI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq",
+ int_x86_sse2_psrl_q, int_x86_sse2_psrli_q>;
+
+defm PSRAW : PDI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw",
+ int_x86_sse2_psra_w, int_x86_sse2_psrai_w>;
+defm PSRAD : PDI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad",
+ int_x86_sse2_psra_d, int_x86_sse2_psrai_d>;
+
+// 128-bit logical shifts.
+let Constraints = "$src1 = $dst", neverHasSideEffects = 1,
+ ExeDomain = SSEPackedInt in {
+ def PSLLDQri : PDIi8<0x73, MRM7r,
+ (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
+ "pslldq\t{$src2, $dst|$dst, $src2}", []>;
+ def PSRLDQri : PDIi8<0x73, MRM3r,
+ (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
+ "psrldq\t{$src2, $dst|$dst, $src2}", []>;
+ // PSRADQri doesn't exist in SSE[1-3].
+}
+
+let Predicates = [HasSSE2] in {
+ def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2),
+ (v2i64 (PSLLDQri VR128:$src1, (BYTE_imm imm:$src2)))>;
+ def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2),
+ (v2i64 (PSRLDQri VR128:$src1, (BYTE_imm imm:$src2)))>;
+ def : Pat<(int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2),
+ (v2i64 (PSLLDQri VR128:$src1, imm:$src2))>;
+ def : Pat<(int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2),
+ (v2i64 (PSRLDQri VR128:$src1, imm:$src2))>;
+ def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)),
+ (v2f64 (PSRLDQri VR128:$src1, (BYTE_imm imm:$src2)))>;
+
+  // Shift up / down and insert zeros.
+ def : Pat<(v2i64 (X86vshl VR128:$src, (i8 imm:$amt))),
+ (v2i64 (PSLLDQri VR128:$src, (BYTE_imm imm:$amt)))>;
+ def : Pat<(v2i64 (X86vshr VR128:$src, (i8 imm:$amt))),
+ (v2i64 (PSRLDQri VR128:$src, (BYTE_imm imm:$amt)))>;
+}
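+// Note on the patterns above: pslldq/psrldq shift by a *byte* count, while
+// the psll_dq/psrl_dq intrinsics take a *bit* count, so the immediate is
+// wrapped in BYTE_imm (assumed here to be the imm >> 3 transform defined
+// earlier in this file); e.g. (int_x86_sse2_psll_dq X, 16) should select to
+// (PSLLDQri X, 2). The *_dq_bs variants already take byte counts and are
+// mapped directly.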
+
+// Logical
+defm PAND : PDI_binop_rm_v2i64<0xDB, "pand", and, 1>;
+defm POR : PDI_binop_rm_v2i64<0xEB, "por" , or , 1>;
+defm PXOR : PDI_binop_rm_v2i64<0xEF, "pxor", xor, 1>;
+
+let Constraints = "$src1 = $dst", ExeDomain = SSEPackedInt in {
+ def PANDNrr : PDI<0xDF, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "pandn\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (v2i64 (and (vnot VR128:$src1),
+ VR128:$src2)))]>;
+
+ def PANDNrm : PDI<0xDF, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ "pandn\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (v2i64 (and (vnot VR128:$src1),
+ (memopv2i64 addr:$src2))))]>;
+}
+
+// SSE2 Integer comparison
+defm PCMPEQB : PDI_binop_rm_int<0x74, "pcmpeqb", int_x86_sse2_pcmpeq_b>;
+defm PCMPEQW : PDI_binop_rm_int<0x75, "pcmpeqw", int_x86_sse2_pcmpeq_w>;
+defm PCMPEQD : PDI_binop_rm_int<0x76, "pcmpeqd", int_x86_sse2_pcmpeq_d>;
+defm PCMPGTB : PDI_binop_rm_int<0x64, "pcmpgtb", int_x86_sse2_pcmpgt_b>;
+defm PCMPGTW : PDI_binop_rm_int<0x65, "pcmpgtw", int_x86_sse2_pcmpgt_w>;
+defm PCMPGTD : PDI_binop_rm_int<0x66, "pcmpgtd", int_x86_sse2_pcmpgt_d>;
+
+def : Pat<(v16i8 (X86pcmpeqb VR128:$src1, VR128:$src2)),
+ (PCMPEQBrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v16i8 (X86pcmpeqb VR128:$src1, (memop addr:$src2))),
+ (PCMPEQBrm VR128:$src1, addr:$src2)>;
+def : Pat<(v8i16 (X86pcmpeqw VR128:$src1, VR128:$src2)),
+ (PCMPEQWrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v8i16 (X86pcmpeqw VR128:$src1, (memop addr:$src2))),
+ (PCMPEQWrm VR128:$src1, addr:$src2)>;
+def : Pat<(v4i32 (X86pcmpeqd VR128:$src1, VR128:$src2)),
+ (PCMPEQDrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v4i32 (X86pcmpeqd VR128:$src1, (memop addr:$src2))),
+ (PCMPEQDrm VR128:$src1, addr:$src2)>;
+
+def : Pat<(v16i8 (X86pcmpgtb VR128:$src1, VR128:$src2)),
+ (PCMPGTBrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v16i8 (X86pcmpgtb VR128:$src1, (memop addr:$src2))),
+ (PCMPGTBrm VR128:$src1, addr:$src2)>;
+def : Pat<(v8i16 (X86pcmpgtw VR128:$src1, VR128:$src2)),
+ (PCMPGTWrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v8i16 (X86pcmpgtw VR128:$src1, (memop addr:$src2))),
+ (PCMPGTWrm VR128:$src1, addr:$src2)>;
+def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, VR128:$src2)),
+ (PCMPGTDrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, (memop addr:$src2))),
+ (PCMPGTDrm VR128:$src1, addr:$src2)>;
+
+
+// Pack instructions
+defm PACKSSWB : PDI_binop_rm_int<0x63, "packsswb", int_x86_sse2_packsswb_128>;
+defm PACKSSDW : PDI_binop_rm_int<0x6B, "packssdw", int_x86_sse2_packssdw_128>;
+defm PACKUSWB : PDI_binop_rm_int<0x67, "packuswb", int_x86_sse2_packuswb_128>;
+
+let ExeDomain = SSEPackedInt in {
+
+// Shuffle and unpack instructions
+let AddedComplexity = 5 in {
+def PSHUFDri : PDIi8<0x70, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2),
+ "pshufd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst, (v4i32 (pshufd:$src2
+ VR128:$src1, (undef))))]>;
+def PSHUFDmi : PDIi8<0x70, MRMSrcMem,
+ (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2),
+ "pshufd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst, (v4i32 (pshufd:$src2
+ (bc_v4i32 (memopv2i64 addr:$src1)),
+ (undef))))]>;
+}
+
+// SSE2 with ImmT == Imm8 and XS prefix.
+def PSHUFHWri : Ii8<0x70, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2),
+ "pshufhw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst, (v8i16 (pshufhw:$src2 VR128:$src1,
+ (undef))))]>,
+ XS, Requires<[HasSSE2]>;
+def PSHUFHWmi : Ii8<0x70, MRMSrcMem,
+ (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2),
+ "pshufhw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst, (v8i16 (pshufhw:$src2
+ (bc_v8i16 (memopv2i64 addr:$src1)),
+ (undef))))]>,
+ XS, Requires<[HasSSE2]>;
+
+// SSE2 with ImmT == Imm8 and XD prefix.
+def PSHUFLWri : Ii8<0x70, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2),
+ "pshuflw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst, (v8i16 (pshuflw:$src2 VR128:$src1,
+ (undef))))]>,
+ XD, Requires<[HasSSE2]>;
+def PSHUFLWmi : Ii8<0x70, MRMSrcMem,
+ (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2),
+ "pshuflw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst, (v8i16 (pshuflw:$src2
+ (bc_v8i16 (memopv2i64 addr:$src1)),
+ (undef))))]>,
+ XD, Requires<[HasSSE2]>;
+
+
+let Constraints = "$src1 = $dst" in {
+ def PUNPCKLBWrr : PDI<0x60, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "punpcklbw\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v16i8 (unpckl VR128:$src1, VR128:$src2)))]>;
+ def PUNPCKLBWrm : PDI<0x60, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ "punpcklbw\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (unpckl VR128:$src1,
+ (bc_v16i8 (memopv2i64 addr:$src2))))]>;
+ def PUNPCKLWDrr : PDI<0x61, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "punpcklwd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v8i16 (unpckl VR128:$src1, VR128:$src2)))]>;
+ def PUNPCKLWDrm : PDI<0x61, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ "punpcklwd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (unpckl VR128:$src1,
+ (bc_v8i16 (memopv2i64 addr:$src2))))]>;
+ def PUNPCKLDQrr : PDI<0x62, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "punpckldq\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4i32 (unpckl VR128:$src1, VR128:$src2)))]>;
+ def PUNPCKLDQrm : PDI<0x62, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ "punpckldq\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (unpckl VR128:$src1,
+ (bc_v4i32 (memopv2i64 addr:$src2))))]>;
+ def PUNPCKLQDQrr : PDI<0x6C, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "punpcklqdq\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2i64 (unpckl VR128:$src1, VR128:$src2)))]>;
+ def PUNPCKLQDQrm : PDI<0x6C, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ "punpcklqdq\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2i64 (unpckl VR128:$src1,
+ (memopv2i64 addr:$src2))))]>;
+
+ def PUNPCKHBWrr : PDI<0x68, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "punpckhbw\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v16i8 (unpckh VR128:$src1, VR128:$src2)))]>;
+ def PUNPCKHBWrm : PDI<0x68, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ "punpckhbw\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (unpckh VR128:$src1,
+ (bc_v16i8 (memopv2i64 addr:$src2))))]>;
+ def PUNPCKHWDrr : PDI<0x69, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "punpckhwd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v8i16 (unpckh VR128:$src1, VR128:$src2)))]>;
+ def PUNPCKHWDrm : PDI<0x69, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ "punpckhwd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (unpckh VR128:$src1,
+ (bc_v8i16 (memopv2i64 addr:$src2))))]>;
+ def PUNPCKHDQrr : PDI<0x6A, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "punpckhdq\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4i32 (unpckh VR128:$src1, VR128:$src2)))]>;
+ def PUNPCKHDQrm : PDI<0x6A, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ "punpckhdq\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (unpckh VR128:$src1,
+ (bc_v4i32 (memopv2i64 addr:$src2))))]>;
+ def PUNPCKHQDQrr : PDI<0x6D, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "punpckhqdq\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2i64 (unpckh VR128:$src1, VR128:$src2)))]>;
+ def PUNPCKHQDQrm : PDI<0x6D, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ "punpckhqdq\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2i64 (unpckh VR128:$src1,
+ (memopv2i64 addr:$src2))))]>;
+}
+
+// Extract / Insert
+def PEXTRWri : PDIi8<0xC5, MRMSrcReg,
+ (outs GR32:$dst), (ins VR128:$src1, i32i8imm:$src2),
+ "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32:$dst, (X86pextrw (v8i16 VR128:$src1),
+ imm:$src2))]>;
+let Constraints = "$src1 = $dst" in {
+ def PINSRWrri : PDIi8<0xC4, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1,
+ GR32:$src2, i32i8imm:$src3),
+ "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR128:$dst,
+ (X86pinsrw VR128:$src1, GR32:$src2, imm:$src3))]>;
+ def PINSRWrmi : PDIi8<0xC4, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1,
+ i16mem:$src2, i32i8imm:$src3),
+ "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR128:$dst,
+ (X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
+ imm:$src3))]>;
+}
+
+// Mask creation
+def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
+ "pmovmskb\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))]>;
+
+// Conditional store
+let Uses = [EDI] in
+def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
+ "maskmovdqu\t{$mask, $src|$src, $mask}",
+ [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>;
+
+let Uses = [RDI] in
+def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
+ "maskmovdqu\t{$mask, $src|$src, $mask}",
+ [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>;
+
+} // ExeDomain = SSEPackedInt
+
+// Non-temporal stores
+def MOVNTPDmr_Int : PDI<0x2B, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
+ "movntpd\t{$src, $dst|$dst, $src}",
+ [(int_x86_sse2_movnt_pd addr:$dst, VR128:$src)]>;
+let ExeDomain = SSEPackedInt in
+def MOVNTDQmr_Int : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movntdq\t{$src, $dst|$dst, $src}",
+ [(int_x86_sse2_movnt_dq addr:$dst, VR128:$src)]>;
+def MOVNTImr_Int : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+ "movnti\t{$src, $dst|$dst, $src}",
+ [(int_x86_sse2_movnt_i addr:$dst, GR32:$src)]>,
+ TB, Requires<[HasSSE2]>;
+
+let AddedComplexity = 400 in { // Prefer non-temporal versions
+def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movntpd\t{$src, $dst|$dst, $src}",
+                     [(alignednontemporalstore (v2f64 VR128:$src), addr:$dst)]>;
+
+let ExeDomain = SSEPackedInt in
+def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+ "movntdq\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>;
+}
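+// For reference: these map to the streaming-store intrinsics (_mm_stream_pd,
+// _mm_stream_si128, _mm_stream_si32), which write around the caches; the
+// 128-bit forms require 16-byte aligned addresses.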
+
+// Flush cache
+def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
+ "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>,
+ TB, Requires<[HasSSE2]>;
+
+// Load, store, and memory fence
+def LFENCE : I<0xAE, MRM_E8, (outs), (ins),
+ "lfence", [(int_x86_sse2_lfence)]>, TB, Requires<[HasSSE2]>;
+def MFENCE : I<0xAE, MRM_F0, (outs), (ins),
+ "mfence", [(int_x86_sse2_mfence)]>, TB, Requires<[HasSSE2]>;
+
+// Pause. This "instruction" is encoded as "rep; nop", so even though it
+// was introduced with SSE2, it's backward compatible.
+def PAUSE : I<0x90, RawFrm, (outs), (ins), "pause", []>, REP;
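+// For reference, a typical (illustrative) use is a spin-wait loop:
+//   while (!flag) _mm_pause();            // <xmmintrin.h>
+// which assembles to F3 90; pre-SSE2 CPUs execute that as a plain NOP.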
+
+// TODO: custom lower this so as to never even generate the noop
+def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm),
+ (i8 0)), (NOOP)>;
+def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>;
+def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>;
+def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm),
+ (i8 1)), (MFENCE)>;
+
+// Alias instruction that materializes the all-ones vector with pcmpeqd.
+// We set canFoldAsLoad because this can be converted to a constant-pool
+// load of an all-ones value if folding it would be beneficial.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+ isCodeGenOnly = 1, ExeDomain = SSEPackedInt in
+ // FIXME: Change encoding to pseudo.
+ def V_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "",
+ [(set VR128:$dst, (v4i32 immAllOnesV))]>;
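+// For reference: "pcmpeqd %xmm0, %xmm0" always compares equal, so it yields
+// the all-ones vector without touching memory; this is how e.g.
+// _mm_set1_epi32(-1) is usually materialized.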
+
+def MOVDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (scalar_to_vector GR32:$src)))]>;
+def MOVDI2PDIrm : PDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>;
+
+def MOVDI2SSrr : PDI<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (bitconvert GR32:$src))]>;
+
+def MOVDI2SSrm : PDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>;
+
+// SSE2 instructions with XS prefix
+def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
+ Requires<[HasSSE2]>;
+def MOVPQI2QImr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(store (i64 (vector_extract (v2i64 VR128:$src),
+ (iPTR 0))), addr:$dst)]>;
+
+def : Pat<(f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
+ (f64 (EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd))>;
+
+def MOVPDI2DIrr : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (vector_extract (v4i32 VR128:$src),
+ (iPTR 0)))]>;
+def MOVPDI2DImr : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(store (i32 (vector_extract (v4i32 VR128:$src),
+ (iPTR 0))), addr:$dst)]>;
+
+def MOVSS2DIrr : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (bitconvert FR32:$src))]>;
+def MOVSS2DImr : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>;
+
+// Store / copy the lower 64 bits of an XMM register.
+def MOVLQ128mr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>;
+
+// movd / movq to XMM register zero-extends
+let AddedComplexity = 15 in {
+def MOVZDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v4i32 (X86vzmovl
+ (v4i32 (scalar_to_vector GR32:$src)))))]>;
+// This is X86-64 only.
+def MOVZQI2PQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+ "mov{d|q}\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v2i64 (X86vzmovl
+ (v2i64 (scalar_to_vector GR64:$src)))))]>;
+}
+
+let AddedComplexity = 20 in {
+def MOVZDI2PDIrm : PDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (X86vzmovl (v4i32 (scalar_to_vector
+ (loadi32 addr:$src))))))]>;
+
+def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))),
+ (MOVZDI2PDIrm addr:$src)>;
+def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
+ (MOVZDI2PDIrm addr:$src)>;
+def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
+ (MOVZDI2PDIrm addr:$src)>;
+
+def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2i64 (X86vzmovl (v2i64 (scalar_to_vector
+ (loadi64 addr:$src))))))]>, XS,
+ Requires<[HasSSE2]>;
+
+def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
+ (MOVZQI2PQIrm addr:$src)>;
+def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
+ (MOVZQI2PQIrm addr:$src)>;
+def : Pat<(v2i64 (X86vzload addr:$src)), (MOVZQI2PQIrm addr:$src)>;
+}
+
+// Move from XMM to XMM and clear the upper 64 bits. Note that the IA-32
+// documentation has a bug here: movq xmm1, xmm2 does clear the high bits.
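+// Example: with xmm2 = <a, b> (two i64 lanes), "movq %xmm2, %xmm1" leaves
+// xmm1 = <a, 0>.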
+let AddedComplexity = 15 in
+def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
+ XS, Requires<[HasSSE2]>;
+
+let AddedComplexity = 20 in {
+def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v2i64 (X86vzmovl
+ (loadv2i64 addr:$src))))]>,
+ XS, Requires<[HasSSE2]>;
+
+def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4i32 addr:$src)))),
+ (MOVZPQILo2PQIrm addr:$src)>;
+}
+
+// Instructions for the disassembler
+// xr = XMM register
+// xm = mem64
+
+def MOVQxrxr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movq\t{$src, $dst|$dst, $src}", []>, XS;
+
+//===---------------------------------------------------------------------===//
+// SSE3 Instructions
+//===---------------------------------------------------------------------===//
+
+// Move Instructions
+def MOVSHDUPrr : S3SI<0x16, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movshdup\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v4f32 (movshdup
+ VR128:$src, (undef))))]>;
+def MOVSHDUPrm : S3SI<0x16, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "movshdup\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (movshdup
+ (memopv4f32 addr:$src), (undef)))]>;
+
+def MOVSLDUPrr : S3SI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movsldup\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v4f32 (movsldup
+ VR128:$src, (undef))))]>;
+def MOVSLDUPrm : S3SI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "movsldup\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (movsldup
+ (memopv4f32 addr:$src), (undef)))]>;
+
+def MOVDDUPrr : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movddup\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,(v2f64 (movddup VR128:$src, (undef))))]>;
+def MOVDDUPrm : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
+ "movddup\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2f64 (movddup (scalar_to_vector (loadf64 addr:$src)),
+ (undef))))]>;
+
+def : Pat<(movddup (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src)))),
+ (undef)),
+ (MOVDDUPrm addr:$src)>, Requires<[HasSSE3]>;
+
+let AddedComplexity = 5 in {
+def : Pat<(movddup (memopv2f64 addr:$src), (undef)),
+ (MOVDDUPrm addr:$src)>, Requires<[HasSSE3]>;
+def : Pat<(movddup (bc_v4f32 (memopv2f64 addr:$src)), (undef)),
+ (MOVDDUPrm addr:$src)>, Requires<[HasSSE3]>;
+def : Pat<(movddup (memopv2i64 addr:$src), (undef)),
+ (MOVDDUPrm addr:$src)>, Requires<[HasSSE3]>;
+def : Pat<(movddup (bc_v4i32 (memopv2i64 addr:$src)), (undef)),
+ (MOVDDUPrm addr:$src)>, Requires<[HasSSE3]>;
+}
+
+// Arithmetic
+let Constraints = "$src1 = $dst" in {
+ def ADDSUBPSrr : S3DI<0xD0, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "addsubps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse3_addsub_ps VR128:$src1,
+ VR128:$src2))]>;
+ def ADDSUBPSrm : S3DI<0xD0, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+ "addsubps\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse3_addsub_ps VR128:$src1,
+ (memop addr:$src2)))]>;
+ def ADDSUBPDrr : S3I<0xD0, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "addsubpd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse3_addsub_pd VR128:$src1,
+ VR128:$src2))]>;
+ def ADDSUBPDrm : S3I<0xD0, MRMSrcMem,
+ (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+ "addsubpd\t{$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse3_addsub_pd VR128:$src1,
+ (memop addr:$src2)))]>;
+}
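+// For reference: ADDSUBPS alternates per lane, { a0-b0, a1+b1, a2-b2, a3+b3 }
+// (_mm_addsub_ps); ADDSUBPD does the same on the two f64 lanes.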
+
+def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "lddqu\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>;
+
+// Horizontal ops
+class S3D_Intrr<bits<8> o, string OpcodeStr, Intrinsic IntId>
+ : S3DI<o, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (v4f32 (IntId VR128:$src1, VR128:$src2)))]>;
+class S3D_Intrm<bits<8> o, string OpcodeStr, Intrinsic IntId>
+ : S3DI<o, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (v4f32 (IntId VR128:$src1, (memop addr:$src2))))]>;
+class S3_Intrr<bits<8> o, string OpcodeStr, Intrinsic IntId>
+ : S3I<o, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (v2f64 (IntId VR128:$src1, VR128:$src2)))]>;
+class S3_Intrm<bits<8> o, string OpcodeStr, Intrinsic IntId>
+ : S3I<o, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (v2f64 (IntId VR128:$src1, (memopv2f64 addr:$src2))))]>;
+
+let Constraints = "$src1 = $dst" in {
+ def HADDPSrr : S3D_Intrr<0x7C, "haddps", int_x86_sse3_hadd_ps>;
+ def HADDPSrm : S3D_Intrm<0x7C, "haddps", int_x86_sse3_hadd_ps>;
+ def HADDPDrr : S3_Intrr <0x7C, "haddpd", int_x86_sse3_hadd_pd>;
+ def HADDPDrm : S3_Intrm <0x7C, "haddpd", int_x86_sse3_hadd_pd>;
+ def HSUBPSrr : S3D_Intrr<0x7D, "hsubps", int_x86_sse3_hsub_ps>;
+ def HSUBPSrm : S3D_Intrm<0x7D, "hsubps", int_x86_sse3_hsub_ps>;
+ def HSUBPDrr : S3_Intrr <0x7D, "hsubpd", int_x86_sse3_hsub_pd>;
+ def HSUBPDrm : S3_Intrm <0x7D, "hsubpd", int_x86_sse3_hsub_pd>;
+}
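+// For reference: the horizontal ops reduce adjacent pairs, e.g.
+// _mm_hadd_ps(a, b) yields { a0+a1, a2+a3, b0+b1, b2+b3 }.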
+
+// Thread synchronization
+def MONITOR : I<0x01, MRM_C8, (outs), (ins), "monitor",
+ [(int_x86_sse3_monitor EAX, ECX, EDX)]>,TB, Requires<[HasSSE3]>;
+def MWAIT : I<0x01, MRM_C9, (outs), (ins), "mwait",
+ [(int_x86_sse3_mwait ECX, EAX)]>, TB, Requires<[HasSSE3]>;
+
+// vector_shuffle v1, <undef> <1, 1, 3, 3>
+let AddedComplexity = 15 in
+def : Pat<(v4i32 (movshdup VR128:$src, (undef))),
+ (MOVSHDUPrr VR128:$src)>, Requires<[HasSSE3]>;
+let AddedComplexity = 20 in
+def : Pat<(v4i32 (movshdup (bc_v4i32 (memopv2i64 addr:$src)), (undef))),
+ (MOVSHDUPrm addr:$src)>, Requires<[HasSSE3]>;
+
+// vector_shuffle v1, <undef> <0, 0, 2, 2>
+let AddedComplexity = 15 in
+ def : Pat<(v4i32 (movsldup VR128:$src, (undef))),
+ (MOVSLDUPrr VR128:$src)>, Requires<[HasSSE3]>;
+let AddedComplexity = 20 in
+ def : Pat<(v4i32 (movsldup (bc_v4i32 (memopv2i64 addr:$src)), (undef))),
+ (MOVSLDUPrm addr:$src)>, Requires<[HasSSE3]>;
+
+//===---------------------------------------------------------------------===//
+// SSSE3 Instructions
+//===---------------------------------------------------------------------===//
+
+/// SS3I_unop_rm_int_8 - Simple SSSE3 unary operator whose type is v*i8.
+multiclass SS3I_unop_rm_int_8<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId64, Intrinsic IntId128> {
+ def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR64:$dst, (IntId64 VR64:$src))]>;
+
+ def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR64:$dst,
+ (IntId64 (bitconvert (memopv8i8 addr:$src))))]>;
+
+ def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (IntId128 VR128:$src))]>,
+ OpSize;
+
+ def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst,
+ (IntId128
+ (bitconvert (memopv16i8 addr:$src))))]>, OpSize;
+}
+
+/// SS3I_unop_rm_int_16 - Simple SSSE3 unary operator whose type is v*i16.
+multiclass SS3I_unop_rm_int_16<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId64, Intrinsic IntId128> {
+ def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst),
+ (ins VR64:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR64:$dst, (IntId64 VR64:$src))]>;
+
+ def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst),
+ (ins i64mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR64:$dst,
+ (IntId64
+ (bitconvert (memopv4i16 addr:$src))))]>;
+
+ def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (IntId128 VR128:$src))]>,
+ OpSize;
+
+ def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst,
+ (IntId128
+ (bitconvert (memopv8i16 addr:$src))))]>, OpSize;
+}
+
+/// SS3I_unop_rm_int_32 - Simple SSSE3 unary operator whose type is v*i32.
+multiclass SS3I_unop_rm_int_32<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId64, Intrinsic IntId128> {
+ def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst),
+ (ins VR64:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR64:$dst, (IntId64 VR64:$src))]>;
+
+ def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst),
+ (ins i64mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR64:$dst,
+ (IntId64
+ (bitconvert (memopv2i32 addr:$src))))]>;
+
+ def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (IntId128 VR128:$src))]>,
+ OpSize;
+
+ def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst,
+ (IntId128
+ (bitconvert (memopv4i32 addr:$src))))]>, OpSize;
+}
+
+defm PABSB : SS3I_unop_rm_int_8 <0x1C, "pabsb",
+ int_x86_ssse3_pabs_b,
+ int_x86_ssse3_pabs_b_128>;
+defm PABSW : SS3I_unop_rm_int_16<0x1D, "pabsw",
+ int_x86_ssse3_pabs_w,
+ int_x86_ssse3_pabs_w_128>;
+defm PABSD : SS3I_unop_rm_int_32<0x1E, "pabsd",
+ int_x86_ssse3_pabs_d,
+ int_x86_ssse3_pabs_d_128>;
+
+/// SS3I_binop_rm_int_8 - Simple SSSE3 binary operator whose type is v*i8.
+let Constraints = "$src1 = $dst" in {
+ multiclass SS3I_binop_rm_int_8<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId64, Intrinsic IntId128,
+ bit Commutable = 0> {
+ def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst),
+ (ins VR64:$src1, VR64:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))]> {
+ let isCommutable = Commutable;
+ }
+ def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst),
+ (ins VR64:$src1, i64mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst,
+ (IntId64 VR64:$src1,
+ (bitconvert (memopv8i8 addr:$src2))))]>;
+
+ def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
+ OpSize {
+ let isCommutable = Commutable;
+ }
+ def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst,
+ (IntId128 VR128:$src1,
+ (bitconvert (memopv16i8 addr:$src2))))]>, OpSize;
+ }
+}
+
+/// SS3I_binop_rm_int_16 - Simple SSSE3 binary operator whose type is v*i16.
+let Constraints = "$src1 = $dst" in {
+ multiclass SS3I_binop_rm_int_16<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId64, Intrinsic IntId128,
+ bit Commutable = 0> {
+ def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst),
+ (ins VR64:$src1, VR64:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))]> {
+ let isCommutable = Commutable;
+ }
+ def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst),
+ (ins VR64:$src1, i64mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst,
+ (IntId64 VR64:$src1,
+ (bitconvert (memopv4i16 addr:$src2))))]>;
+
+ def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
+ OpSize {
+ let isCommutable = Commutable;
+ }
+ def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst,
+ (IntId128 VR128:$src1,
+ (bitconvert (memopv8i16 addr:$src2))))]>, OpSize;
+ }
+}
+
+/// SS3I_binop_rm_int_32 - Simple SSSE3 binary operator whose type is v*i32.
+let Constraints = "$src1 = $dst" in {
+ multiclass SS3I_binop_rm_int_32<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId64, Intrinsic IntId128,
+ bit Commutable = 0> {
+ def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst),
+ (ins VR64:$src1, VR64:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))]> {
+ let isCommutable = Commutable;
+ }
+ def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst),
+ (ins VR64:$src1, i64mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst,
+ (IntId64 VR64:$src1,
+ (bitconvert (memopv2i32 addr:$src2))))]>;
+
+ def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
+ OpSize {
+ let isCommutable = Commutable;
+ }
+ def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst,
+ (IntId128 VR128:$src1,
+ (bitconvert (memopv4i32 addr:$src2))))]>, OpSize;
+ }
+}
+
+let ImmT = NoImm in { // None of these have i8 immediate fields.
+defm PHADDW : SS3I_binop_rm_int_16<0x01, "phaddw",
+ int_x86_ssse3_phadd_w,
+ int_x86_ssse3_phadd_w_128>;
+defm PHADDD : SS3I_binop_rm_int_32<0x02, "phaddd",
+ int_x86_ssse3_phadd_d,
+ int_x86_ssse3_phadd_d_128>;
+defm PHADDSW : SS3I_binop_rm_int_16<0x03, "phaddsw",
+ int_x86_ssse3_phadd_sw,
+ int_x86_ssse3_phadd_sw_128>;
+defm PHSUBW : SS3I_binop_rm_int_16<0x05, "phsubw",
+ int_x86_ssse3_phsub_w,
+ int_x86_ssse3_phsub_w_128>;
+defm PHSUBD : SS3I_binop_rm_int_32<0x06, "phsubd",
+ int_x86_ssse3_phsub_d,
+ int_x86_ssse3_phsub_d_128>;
+defm PHSUBSW : SS3I_binop_rm_int_16<0x07, "phsubsw",
+ int_x86_ssse3_phsub_sw,
+ int_x86_ssse3_phsub_sw_128>;
+defm PMADDUBSW : SS3I_binop_rm_int_8 <0x04, "pmaddubsw",
+ int_x86_ssse3_pmadd_ub_sw,
+ int_x86_ssse3_pmadd_ub_sw_128>;
+defm PMULHRSW : SS3I_binop_rm_int_16<0x0B, "pmulhrsw",
+ int_x86_ssse3_pmul_hr_sw,
+ int_x86_ssse3_pmul_hr_sw_128, 1>;
+
+defm PSHUFB : SS3I_binop_rm_int_8 <0x00, "pshufb",
+ int_x86_ssse3_pshuf_b,
+ int_x86_ssse3_pshuf_b_128>;
+defm PSIGNB : SS3I_binop_rm_int_8 <0x08, "psignb",
+ int_x86_ssse3_psign_b,
+ int_x86_ssse3_psign_b_128>;
+defm PSIGNW : SS3I_binop_rm_int_16<0x09, "psignw",
+ int_x86_ssse3_psign_w,
+ int_x86_ssse3_psign_w_128>;
+defm PSIGND : SS3I_binop_rm_int_32<0x0A, "psignd",
+ int_x86_ssse3_psign_d,
+ int_x86_ssse3_psign_d_128>;
+}
+
+// palignr patterns.
+let Constraints = "$src1 = $dst" in {
+ def PALIGNR64rr : SS3AI<0x0F, MRMSrcReg, (outs VR64:$dst),
+ (ins VR64:$src1, VR64:$src2, i8imm:$src3),
+ "palignr\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ []>;
+ def PALIGNR64rm : SS3AI<0x0F, MRMSrcMem, (outs VR64:$dst),
+ (ins VR64:$src1, i64mem:$src2, i8imm:$src3),
+ "palignr\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ []>;
+
+ def PALIGNR128rr : SS3AI<0x0F, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+ "palignr\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ []>, OpSize;
+ def PALIGNR128rm : SS3AI<0x0F, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+ "palignr\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ []>, OpSize;
+}
+
+let AddedComplexity = 5 in {
+
+def : Pat<(v1i64 (palign:$src3 VR64:$src1, VR64:$src2)),
+ (PALIGNR64rr VR64:$src2, VR64:$src1,
+ (SHUFFLE_get_palign_imm VR64:$src3))>,
+ Requires<[HasSSSE3]>;
+def : Pat<(v2i32 (palign:$src3 VR64:$src1, VR64:$src2)),
+ (PALIGNR64rr VR64:$src2, VR64:$src1,
+ (SHUFFLE_get_palign_imm VR64:$src3))>,
+ Requires<[HasSSSE3]>;
+def : Pat<(v2f32 (palign:$src3 VR64:$src1, VR64:$src2)),
+ (PALIGNR64rr VR64:$src2, VR64:$src1,
+ (SHUFFLE_get_palign_imm VR64:$src3))>,
+ Requires<[HasSSSE3]>;
+def : Pat<(v4i16 (palign:$src3 VR64:$src1, VR64:$src2)),
+ (PALIGNR64rr VR64:$src2, VR64:$src1,
+ (SHUFFLE_get_palign_imm VR64:$src3))>,
+ Requires<[HasSSSE3]>;
+def : Pat<(v8i8 (palign:$src3 VR64:$src1, VR64:$src2)),
+ (PALIGNR64rr VR64:$src2, VR64:$src1,
+ (SHUFFLE_get_palign_imm VR64:$src3))>,
+ Requires<[HasSSSE3]>;
+
+def : Pat<(v4i32 (palign:$src3 VR128:$src1, VR128:$src2)),
+ (PALIGNR128rr VR128:$src2, VR128:$src1,
+ (SHUFFLE_get_palign_imm VR128:$src3))>,
+ Requires<[HasSSSE3]>;
+def : Pat<(v4f32 (palign:$src3 VR128:$src1, VR128:$src2)),
+ (PALIGNR128rr VR128:$src2, VR128:$src1,
+ (SHUFFLE_get_palign_imm VR128:$src3))>,
+ Requires<[HasSSSE3]>;
+def : Pat<(v8i16 (palign:$src3 VR128:$src1, VR128:$src2)),
+ (PALIGNR128rr VR128:$src2, VR128:$src1,
+ (SHUFFLE_get_palign_imm VR128:$src3))>,
+ Requires<[HasSSSE3]>;
+def : Pat<(v16i8 (palign:$src3 VR128:$src1, VR128:$src2)),
+ (PALIGNR128rr VR128:$src2, VR128:$src1,
+ (SHUFFLE_get_palign_imm VR128:$src3))>,
+ Requires<[HasSSSE3]>;
+}
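+// Note on operand order: PALIGNR concatenates its operands with the first
+// (tied) operand as the high half, shifts right by imm bytes, and keeps the
+// low 64/128 bits; that is presumably why the patterns above pass the shuffle
+// operands swapped (same convention as _mm_alignr_epi8).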
+
+def : Pat<(X86pshufb VR128:$src, VR128:$mask),
+ (PSHUFBrr128 VR128:$src, VR128:$mask)>, Requires<[HasSSSE3]>;
+def : Pat<(X86pshufb VR128:$src, (bc_v16i8 (memopv2i64 addr:$mask))),
+ (PSHUFBrm128 VR128:$src, addr:$mask)>, Requires<[HasSSSE3]>;
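+// For reference: PSHUFB uses the low bits of each mask byte as a source index
+// and zeroes any lane whose mask byte has its sign bit set (_mm_shuffle_epi8).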
+
+//===---------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===---------------------------------------------------------------------===//
+
+// extload f32 -> f64. This matches load+fextend because we have a hack in
+// the isel (PreprocessForFPConvert) that can introduce loads after dag
+// combine.
+// Since these loads aren't folded into the fextend, we have to match it
+// explicitly here.
+let Predicates = [HasSSE2] in
+ def : Pat<(fextend (loadf32 addr:$src)),
+ (CVTSS2SDrm addr:$src)>;
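+// Illustrative C for the pattern above (function name hypothetical):
+//   double widen(const float *p) { return *p; }
+// typically selects cvtss2sd with a memory operand, folding the f32 load
+// into the conversion.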
+
+// bit_convert
+let Predicates = [HasSSE2] in {
+ def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
+ def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
+ def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
+ def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>;
+ def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>;
+ def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>;
+ def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>;
+ def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>;
+ def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>;
+ def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>;
+ def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>;
+ def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>;
+ def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>;
+ def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>;
+ def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>;
+ def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>;
+ def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>;
+ def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>;
+ def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>;
+ def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>;
+ def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>;
+ def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>;
+ def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>;
+ def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>;
+ def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>;
+ def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>;
+ def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>;
+ def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
+ def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>;
+ def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
+}
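+// For reference: these bitcasts are register-class no-ops; e.g.
+// _mm_castps_si128() produces no machine instruction since every 128-bit
+// vector type lives in VR128.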
+
+// Move a scalar to an XMM register, zero-extending the upper elements.
+let AddedComplexity = 15 in {
+// Zero a VR128, then insert into the low element with MOVS{S|D}.
+def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
+ (MOVSDrr (v2f64 (V_SET0PS)), FR64:$src)>;
+def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
+ (MOVSSrr (v4f32 (V_SET0PS)), FR32:$src)>;
+def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
+ (MOVSSrr (v4f32 (V_SET0PS)),
+ (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss)))>;
+def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
+ (MOVSSrr (v4i32 (V_SET0PI)),
+ (EXTRACT_SUBREG (v4i32 VR128:$src), sub_ss))>;
+}
+
+// Splat v2f64 / v2i64
+let AddedComplexity = 10 in {
+def : Pat<(splat_lo (v2f64 VR128:$src), (undef)),
+ (UNPCKLPDrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(unpckh (v2f64 VR128:$src), (undef)),
+ (UNPCKHPDrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(splat_lo (v2i64 VR128:$src), (undef)),
+ (PUNPCKLQDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(unpckh (v2i64 VR128:$src), (undef)),
+ (PUNPCKHQDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+}
+
+// Special unary SHUFPSrri case.
+def : Pat<(v4f32 (pshufd:$src3 VR128:$src1, (undef))),
+ (SHUFPSrri VR128:$src1, VR128:$src1,
+ (SHUFFLE_get_shuf_imm VR128:$src3))>;
+let AddedComplexity = 5 in
+def : Pat<(v4f32 (pshufd:$src2 VR128:$src1, (undef))),
+ (PSHUFDri VR128:$src1, (SHUFFLE_get_shuf_imm VR128:$src2))>,
+ Requires<[HasSSE2]>;
+// Special unary SHUFPDrri case.
+def : Pat<(v2i64 (pshufd:$src3 VR128:$src1, (undef))),
+ (SHUFPDrri VR128:$src1, VR128:$src1,
+ (SHUFFLE_get_shuf_imm VR128:$src3))>,
+ Requires<[HasSSE2]>;
+// Special unary SHUFPDrri case.
+def : Pat<(v2f64 (pshufd:$src3 VR128:$src1, (undef))),
+ (SHUFPDrri VR128:$src1, VR128:$src1,
+ (SHUFFLE_get_shuf_imm VR128:$src3))>,
+ Requires<[HasSSE2]>;
+// Unary v4f32 shuffle with PSHUF* in order to fold a load.
+def : Pat<(pshufd:$src2 (bc_v4i32 (memopv4f32 addr:$src1)), (undef)),
+ (PSHUFDmi addr:$src1, (SHUFFLE_get_shuf_imm VR128:$src2))>,
+ Requires<[HasSSE2]>;
+
+// Special binary v4i32 shuffle cases with SHUFPS.
+def : Pat<(v4i32 (shufp:$src3 VR128:$src1, (v4i32 VR128:$src2))),
+ (SHUFPSrri VR128:$src1, VR128:$src2,
+ (SHUFFLE_get_shuf_imm VR128:$src3))>,
+ Requires<[HasSSE2]>;
+def : Pat<(v4i32 (shufp:$src3 VR128:$src1, (bc_v4i32 (memopv2i64 addr:$src2)))),
+ (SHUFPSrmi VR128:$src1, addr:$src2,
+ (SHUFFLE_get_shuf_imm VR128:$src3))>,
+ Requires<[HasSSE2]>;
+// Special binary v2i64 shuffle cases using SHUFPDrri.
+def : Pat<(v2i64 (shufp:$src3 VR128:$src1, VR128:$src2)),
+ (SHUFPDrri VR128:$src1, VR128:$src2,
+ (SHUFFLE_get_shuf_imm VR128:$src3))>,
+ Requires<[HasSSE2]>;
+
+// vector_shuffle v1, <undef>, <0, 0, 1, 1, ...>
+let AddedComplexity = 15 in {
+def : Pat<(v4i32 (unpckl_undef:$src2 VR128:$src, (undef))),
+ (PSHUFDri VR128:$src, (SHUFFLE_get_shuf_imm VR128:$src2))>,
+ Requires<[OptForSpeed, HasSSE2]>;
+def : Pat<(v4f32 (unpckl_undef:$src2 VR128:$src, (undef))),
+ (PSHUFDri VR128:$src, (SHUFFLE_get_shuf_imm VR128:$src2))>,
+ Requires<[OptForSpeed, HasSSE2]>;
+}
+let AddedComplexity = 10 in {
+def : Pat<(v4f32 (unpckl_undef VR128:$src, (undef))),
+ (UNPCKLPSrr VR128:$src, VR128:$src)>;
+def : Pat<(v16i8 (unpckl_undef VR128:$src, (undef))),
+ (PUNPCKLBWrr VR128:$src, VR128:$src)>;
+def : Pat<(v8i16 (unpckl_undef VR128:$src, (undef))),
+ (PUNPCKLWDrr VR128:$src, VR128:$src)>;
+def : Pat<(v4i32 (unpckl_undef VR128:$src, (undef))),
+ (PUNPCKLDQrr VR128:$src, VR128:$src)>;
+}
+
+// vector_shuffle v1, <undef>, <2, 2, 3, 3, ...>
+let AddedComplexity = 15 in {
+def : Pat<(v4i32 (unpckh_undef:$src2 VR128:$src, (undef))),
+ (PSHUFDri VR128:$src, (SHUFFLE_get_shuf_imm VR128:$src2))>,
+ Requires<[OptForSpeed, HasSSE2]>;
+def : Pat<(v4f32 (unpckh_undef:$src2 VR128:$src, (undef))),
+ (PSHUFDri VR128:$src, (SHUFFLE_get_shuf_imm VR128:$src2))>,
+ Requires<[OptForSpeed, HasSSE2]>;
+}
+let AddedComplexity = 10 in {
+def : Pat<(v4f32 (unpckh_undef VR128:$src, (undef))),
+ (UNPCKHPSrr VR128:$src, VR128:$src)>;
+def : Pat<(v16i8 (unpckh_undef VR128:$src, (undef))),
+ (PUNPCKHBWrr VR128:$src, VR128:$src)>;
+def : Pat<(v8i16 (unpckh_undef VR128:$src, (undef))),
+ (PUNPCKHWDrr VR128:$src, VR128:$src)>;
+def : Pat<(v4i32 (unpckh_undef VR128:$src, (undef))),
+ (PUNPCKHDQrr VR128:$src, VR128:$src)>;
+}
+
+let AddedComplexity = 20 in {
+// vector_shuffle v1, v2 <0, 1, 4, 5> using MOVLHPS
+def : Pat<(v4i32 (movlhps VR128:$src1, VR128:$src2)),
+ (MOVLHPSrr VR128:$src1, VR128:$src2)>;
+
+// vector_shuffle v1, v2 <6, 7, 2, 3> using MOVHLPS
+def : Pat<(v4i32 (movhlps VR128:$src1, VR128:$src2)),
+ (MOVHLPSrr VR128:$src1, VR128:$src2)>;
+
+// vector_shuffle v1, undef <2, ?, ?, ?> using MOVHLPS
+def : Pat<(v4f32 (movhlps_undef VR128:$src1, (undef))),
+ (MOVHLPSrr VR128:$src1, VR128:$src1)>;
+def : Pat<(v4i32 (movhlps_undef VR128:$src1, (undef))),
+ (MOVHLPSrr VR128:$src1, VR128:$src1)>;
+}
+
+let AddedComplexity = 20 in {
+// vector_shuffle v1, (load v2) <4, 5, 2, 3> using MOVLPS
+def : Pat<(v4f32 (movlp VR128:$src1, (load addr:$src2))),
+ (MOVLPSrm VR128:$src1, addr:$src2)>;
+def : Pat<(v2f64 (movlp VR128:$src1, (load addr:$src2))),
+ (MOVLPDrm VR128:$src1, addr:$src2)>;
+def : Pat<(v4i32 (movlp VR128:$src1, (load addr:$src2))),
+ (MOVLPSrm VR128:$src1, addr:$src2)>;
+def : Pat<(v2i64 (movlp VR128:$src1, (load addr:$src2))),
+ (MOVLPDrm VR128:$src1, addr:$src2)>;
+}
+
+// (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
+def : Pat<(store (v4f32 (movlp (load addr:$src1), VR128:$src2)), addr:$src1),
+ (MOVLPSmr addr:$src1, VR128:$src2)>;
+def : Pat<(store (v2f64 (movlp (load addr:$src1), VR128:$src2)), addr:$src1),
+ (MOVLPDmr addr:$src1, VR128:$src2)>;
+def : Pat<(store (v4i32 (movlp (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)),
+ addr:$src1),
+ (MOVLPSmr addr:$src1, VR128:$src2)>;
+def : Pat<(store (v2i64 (movlp (load addr:$src1), VR128:$src2)), addr:$src1),
+ (MOVLPDmr addr:$src1, VR128:$src2)>;
+
+let AddedComplexity = 15 in {
+// Setting the lowest element in the vector.
+def : Pat<(v4i32 (movl VR128:$src1, VR128:$src2)),
+ (MOVSSrr (v4i32 VR128:$src1),
+ (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>;
+def : Pat<(v2i64 (movl VR128:$src1, VR128:$src2)),
+ (MOVSDrr (v2i64 VR128:$src1),
+ (EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>;
+
+// vector_shuffle v1, v2 <4, 5, 2, 3> using movsd
+def : Pat<(v4f32 (movlp VR128:$src1, VR128:$src2)),
+ (MOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, sub_sd))>,
+ Requires<[HasSSE2]>;
+def : Pat<(v4i32 (movlp VR128:$src1, VR128:$src2)),
+ (MOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, sub_sd))>,
+ Requires<[HasSSE2]>;
+}
+
+// vector_shuffle v1, v2 <4, 5, 2, 3> using SHUFPSrri (we prefer movsd, but
+// fall back to this for SSE1)
+def : Pat<(v4f32 (movlp:$src3 VR128:$src1, (v4f32 VR128:$src2))),
+ (SHUFPSrri VR128:$src2, VR128:$src1,
+ (SHUFFLE_get_shuf_imm VR128:$src3))>;
+
+// Set lowest element and zero upper elements.
+def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
+ (MOVZPQILo2PQIrr VR128:$src)>, Requires<[HasSSE2]>;
+
+// Some special case pandn patterns.
+def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v4i32 immAllOnesV))),
+ VR128:$src2)),
+ (PANDNrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v8i16 immAllOnesV))),
+ VR128:$src2)),
+ (PANDNrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v16i8 immAllOnesV))),
+ VR128:$src2)),
+ (PANDNrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+
+def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v4i32 immAllOnesV))),
+ (memop addr:$src2))),
+ (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v8i16 immAllOnesV))),
+ (memop addr:$src2))),
+ (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v16i8 immAllOnesV))),
+ (memop addr:$src2))),
+ (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+
+// vector -> vector casts
+def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
+ (Int_CVTDQ2PSrr VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
+ (Int_CVTTPS2DQrr VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v2f64 (sint_to_fp (v2i32 VR64:$src))),
+ (Int_CVTPI2PDrr VR64:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v2i32 (fp_to_sint (v2f64 VR128:$src))),
+ (Int_CVTTPD2PIrr VR128:$src)>, Requires<[HasSSE2]>;
+
+// Use movaps / movups for SSE integer load / store (one byte shorter).
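+// (movaps is 0F 28 /r, while movdqa needs the operand-size prefix: 66 0F 6F.)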
+def : Pat<(alignedloadv4i32 addr:$src),
+ (MOVAPSrm addr:$src)>;
+def : Pat<(loadv4i32 addr:$src),
+ (MOVUPSrm addr:$src)>;
+def : Pat<(alignedloadv2i64 addr:$src),
+ (MOVAPSrm addr:$src)>;
+def : Pat<(loadv2i64 addr:$src),
+ (MOVUPSrm addr:$src)>;
+
+def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
+ (MOVAPSmr addr:$dst, VR128:$src)>;
+def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
+ (MOVAPSmr addr:$dst, VR128:$src)>;
+def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
+ (MOVAPSmr addr:$dst, VR128:$src)>;
+def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
+ (MOVAPSmr addr:$dst, VR128:$src)>;
+def : Pat<(store (v2i64 VR128:$src), addr:$dst),
+ (MOVUPSmr addr:$dst, VR128:$src)>;
+def : Pat<(store (v4i32 VR128:$src), addr:$dst),
+ (MOVUPSmr addr:$dst, VR128:$src)>;
+def : Pat<(store (v8i16 VR128:$src), addr:$dst),
+ (MOVUPSmr addr:$dst, VR128:$src)>;
+def : Pat<(store (v16i8 VR128:$src), addr:$dst),
+ (MOVUPSmr addr:$dst, VR128:$src)>;
+
+//===----------------------------------------------------------------------===//
+// SSE4.1 Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd,
+ string OpcodeStr,
+ Intrinsic V4F32Int,
+ Intrinsic V2F64Int> {
+ // Intrinsic operation, reg.
+ // Vector intrinsic operation, reg
+ def PSr_Int : SS4AIi8<opcps, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst, (V4F32Int VR128:$src1, imm:$src2))]>,
+ OpSize;
+
+ // Vector intrinsic operation, mem
+ def PSm_Int : Ii8<opcps, MRMSrcMem,
+ (outs VR128:$dst), (ins f128mem:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (V4F32Int (memopv4f32 addr:$src1),imm:$src2))]>,
+ TA, OpSize,
+ Requires<[HasSSE41]>;
+
+ // Vector intrinsic operation, reg
+ def PDr_Int : SS4AIi8<opcpd, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst, (V2F64Int VR128:$src1, imm:$src2))]>,
+ OpSize;
+
+ // Vector intrinsic operation, mem
+ def PDm_Int : SS4AIi8<opcpd, MRMSrcMem,
+ (outs VR128:$dst), (ins f128mem:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (V2F64Int (memopv2f64 addr:$src1),imm:$src2))]>,
+ OpSize;
+}
+
+let Constraints = "$src1 = $dst" in {
+multiclass sse41_fp_binop_rm<bits<8> opcss, bits<8> opcsd,
+ string OpcodeStr,
+ Intrinsic F32Int,
+ Intrinsic F64Int> {
+ // Intrinsic operation, reg.
+ def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
+ (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst,
+ (F32Int VR128:$src1, VR128:$src2, imm:$src3))]>,
+ OpSize;
+
+ // Intrinsic operation, mem.
+ def SSm_Int : SS4AIi8<opcss, MRMSrcMem,
+ (outs VR128:$dst),
+ (ins VR128:$src1, ssmem:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst,
+ (F32Int VR128:$src1, sse_load_f32:$src2, imm:$src3))]>,
+ OpSize;
+
+ // Intrinsic operation, reg.
+ def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
+ (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst,
+ (F64Int VR128:$src1, VR128:$src2, imm:$src3))]>,
+ OpSize;
+
+ // Intrinsic operation, mem.
+ def SDm_Int : SS4AIi8<opcsd, MRMSrcMem,
+ (outs VR128:$dst),
+ (ins VR128:$src1, sdmem:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst,
+ (F64Int VR128:$src1, sse_load_f64:$src2, imm:$src3))]>,
+ OpSize;
+}
+}
+
+// FP round - roundss, roundps, roundsd, roundpd
+defm ROUND : sse41_fp_unop_rm<0x08, 0x09, "round",
+ int_x86_sse41_round_ps, int_x86_sse41_round_pd>;
+defm ROUND : sse41_fp_binop_rm<0x0A, 0x0B, "round",
+ int_x86_sse41_round_ss, int_x86_sse41_round_sd>;
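+// For reference (roughly): the round immediate's bits [1:0] pick the rounding
+// mode (nearest / down / up / truncate), bit 2 defers to MXCSR.RC, and bit 3
+// suppresses the precision exception; e.g. _mm_floor_ps(x) uses imm 0x09.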
+
+// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
+multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId128> {
+ def rr128 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (IntId128 VR128:$src))]>, OpSize;
+ def rm128 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst,
+ (IntId128
+ (bitconvert (memopv8i16 addr:$src))))]>, OpSize;
+}
+
+defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw",
+ int_x86_sse41_phminposuw>;
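+// For reference: PHMINPOSUW returns the minimum unsigned 16-bit lane in word 0
+// and its index in word 1, zeroing the remaining lanes (_mm_minpos_epu16).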
+
+/// SS41I_binop_rm_int - Simple SSE 4.1 binary operator
+let Constraints = "$src1 = $dst" in {
+ multiclass SS41I_binop_rm_int<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId128, bit Commutable = 0> {
+ def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
+ OpSize {
+ let isCommutable = Commutable;
+ }
+ def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst,
+ (IntId128 VR128:$src1,
+ (bitconvert (memopv16i8 addr:$src2))))]>, OpSize;
+ }
+}
+
+defm PCMPEQQ : SS41I_binop_rm_int<0x29, "pcmpeqq",
+ int_x86_sse41_pcmpeqq, 1>;
+defm PACKUSDW : SS41I_binop_rm_int<0x2B, "packusdw",
+ int_x86_sse41_packusdw, 0>;
+defm PMINSB : SS41I_binop_rm_int<0x38, "pminsb",
+ int_x86_sse41_pminsb, 1>;
+defm PMINSD : SS41I_binop_rm_int<0x39, "pminsd",
+ int_x86_sse41_pminsd, 1>;
+defm PMINUD : SS41I_binop_rm_int<0x3B, "pminud",
+ int_x86_sse41_pminud, 1>;
+defm PMINUW : SS41I_binop_rm_int<0x3A, "pminuw",
+ int_x86_sse41_pminuw, 1>;
+defm PMAXSB : SS41I_binop_rm_int<0x3C, "pmaxsb",
+ int_x86_sse41_pmaxsb, 1>;
+defm PMAXSD : SS41I_binop_rm_int<0x3D, "pmaxsd",
+ int_x86_sse41_pmaxsd, 1>;
+defm PMAXUD : SS41I_binop_rm_int<0x3F, "pmaxud",
+ int_x86_sse41_pmaxud, 1>;
+defm PMAXUW : SS41I_binop_rm_int<0x3E, "pmaxuw",
+ int_x86_sse41_pmaxuw, 1>;
+
+defm PMULDQ : SS41I_binop_rm_int<0x28, "pmuldq", int_x86_sse41_pmuldq, 1>;
+
+def : Pat<(v2i64 (X86pcmpeqq VR128:$src1, VR128:$src2)),
+ (PCMPEQQrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v2i64 (X86pcmpeqq VR128:$src1, (memop addr:$src2))),
+ (PCMPEQQrm VR128:$src1, addr:$src2)>;
+
+/// SS41I_binop_patint - Simple SSE 4.1 binary operator, in both generic-node
+/// (OpNode) and intrinsic forms.
+let Constraints = "$src1 = $dst" in {
+ multiclass SS41I_binop_patint<bits<8> opc, string OpcodeStr, ValueType OpVT,
+ SDNode OpNode, Intrinsic IntId128,
+ bit Commutable = 0> {
+ def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (OpNode (OpVT VR128:$src1),
+ VR128:$src2))]>, OpSize {
+ let isCommutable = Commutable;
+ }
+ def rr_int : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
+ OpSize {
+ let isCommutable = Commutable;
+ }
+ def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst,
+ (OpVT (OpNode VR128:$src1, (memop addr:$src2))))]>, OpSize;
+ def rm_int : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst,
+ (IntId128 VR128:$src1, (memop addr:$src2)))]>,
+ OpSize;
+ }
+}
+
+/// SS48I_binop_rm - Simple SSE41 binary operator.
+let Constraints = "$src1 = $dst" in {
+multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType OpVT, bit Commutable = 0> {
+ def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (OpVT (OpNode VR128:$src1, VR128:$src2)))]>,
+ OpSize {
+ let isCommutable = Commutable;
+ }
+ def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (OpNode VR128:$src1,
+ (bc_v4i32 (memopv2i64 addr:$src2))))]>,
+ OpSize;
+}
+}
+
+defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, 1>;
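+// For reference: PMULLD is the low-half 32x32 multiply (_mm_mullo_epi32);
+// the widening even-lane multiply is PMULDQ above.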
+
+/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
+let Constraints = "$src1 = $dst" in {
+ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId128, bit Commutable = 0> {
+ def rri : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst,
+ (IntId128 VR128:$src1, VR128:$src2, imm:$src3))]>,
+ OpSize {
+ let isCommutable = Commutable;
+ }
+ def rmi : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst,
+ (IntId128 VR128:$src1,
+ (bitconvert (memopv16i8 addr:$src2)), imm:$src3))]>,
+ OpSize;
+ }
+}
+
+defm BLENDPS : SS41I_binop_rmi_int<0x0C, "blendps",
+ int_x86_sse41_blendps, 0>;
+defm BLENDPD : SS41I_binop_rmi_int<0x0D, "blendpd",
+ int_x86_sse41_blendpd, 0>;
+defm PBLENDW : SS41I_binop_rmi_int<0x0E, "pblendw",
+ int_x86_sse41_pblendw, 0>;
+defm DPPS : SS41I_binop_rmi_int<0x40, "dpps",
+ int_x86_sse41_dpps, 1>;
+defm DPPD : SS41I_binop_rmi_int<0x41, "dppd",
+ int_x86_sse41_dppd, 1>;
+defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw",
+ int_x86_sse41_mpsadbw, 0>;
+
+
+/// SS41I_ternary_int - SSE 4.1 ternary operator
+let Uses = [XMM0], Constraints = "$src1 = $dst" in {
+ multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
+ def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr,
+ "\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}"),
+ [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))]>,
+ OpSize;
+
+ def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr,
+ "\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}"),
+ [(set VR128:$dst,
+ (IntId VR128:$src1,
+ (bitconvert (memopv16i8 addr:$src2)), XMM0))]>, OpSize;
+ }
+}
+
+defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", int_x86_sse41_blendvpd>;
+defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", int_x86_sse41_blendvps>;
+defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", int_x86_sse41_pblendvb>;
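+// For reference: the blend mask is the implicit XMM0 operand, so e.g.
+// _mm_blendv_ps(a, b, mask) picks b's lane wherever mask's sign bit is set.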
+
+
+multiclass SS41I_binop_rm_int8<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
+ def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (IntId VR128:$src))]>, OpSize;
+
+ def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst,
+ (IntId (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))]>,
+ OpSize;
+}
+
+defm PMOVSXBW : SS41I_binop_rm_int8<0x20, "pmovsxbw", int_x86_sse41_pmovsxbw>;
+defm PMOVSXWD : SS41I_binop_rm_int8<0x23, "pmovsxwd", int_x86_sse41_pmovsxwd>;
+defm PMOVSXDQ : SS41I_binop_rm_int8<0x25, "pmovsxdq", int_x86_sse41_pmovsxdq>;
+defm PMOVZXBW : SS41I_binop_rm_int8<0x30, "pmovzxbw", int_x86_sse41_pmovzxbw>;
+defm PMOVZXWD : SS41I_binop_rm_int8<0x33, "pmovzxwd", int_x86_sse41_pmovzxwd>;
+defm PMOVZXDQ : SS41I_binop_rm_int8<0x35, "pmovzxdq", int_x86_sse41_pmovzxdq>;
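+// For reference: e.g. PMOVSXBW sign-extends the low 8 bytes of its source to
+// eight 16-bit lanes (_mm_cvtepi8_epi16); the memory forms read only 64 bits,
+// which is why the rm defs above take i64mem.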
+
+// Common patterns involving scalar load.
+def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)),
+ (PMOVSXBWrm addr:$src)>, Requires<[HasSSE41]>;
+def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)),
+ (PMOVSXBWrm addr:$src)>, Requires<[HasSSE41]>;
+
+def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)),
+ (PMOVSXWDrm addr:$src)>, Requires<[HasSSE41]>;
+def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)),
+ (PMOVSXWDrm addr:$src)>, Requires<[HasSSE41]>;
+
+def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)),
+ (PMOVSXDQrm addr:$src)>, Requires<[HasSSE41]>;
+def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)),
+ (PMOVSXDQrm addr:$src)>, Requires<[HasSSE41]>;
+
+def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)),
+ (PMOVZXBWrm addr:$src)>, Requires<[HasSSE41]>;
+def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)),
+ (PMOVZXBWrm addr:$src)>, Requires<[HasSSE41]>;
+
+def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)),
+ (PMOVZXWDrm addr:$src)>, Requires<[HasSSE41]>;
+def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)),
+ (PMOVZXWDrm addr:$src)>, Requires<[HasSSE41]>;
+
+def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)),
+ (PMOVZXDQrm addr:$src)>, Requires<[HasSSE41]>;
+def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)),
+ (PMOVZXDQrm addr:$src)>, Requires<[HasSSE41]>;
+
+
+multiclass SS41I_binop_rm_int4<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
+ def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (IntId VR128:$src))]>, OpSize;
+
+ def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst,
+ (IntId (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))]>,
+ OpSize;
+}
+
+defm PMOVSXBD : SS41I_binop_rm_int4<0x21, "pmovsxbd", int_x86_sse41_pmovsxbd>;
+defm PMOVSXWQ : SS41I_binop_rm_int4<0x24, "pmovsxwq", int_x86_sse41_pmovsxwq>;
+defm PMOVZXBD : SS41I_binop_rm_int4<0x31, "pmovzxbd", int_x86_sse41_pmovzxbd>;
+defm PMOVZXWQ : SS41I_binop_rm_int4<0x34, "pmovzxwq", int_x86_sse41_pmovzxwq>;
+
+// Common patterns involving scalar load
+def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)),
+ (PMOVSXBDrm addr:$src)>, Requires<[HasSSE41]>;
+def : Pat<(int_x86_sse41_pmovsxwq (vzmovl_v4i32 addr:$src)),
+ (PMOVSXWQrm addr:$src)>, Requires<[HasSSE41]>;
+
+def : Pat<(int_x86_sse41_pmovzxbd (vzmovl_v4i32 addr:$src)),
+ (PMOVZXBDrm addr:$src)>, Requires<[HasSSE41]>;
+def : Pat<(int_x86_sse41_pmovzxwq (vzmovl_v4i32 addr:$src)),
+ (PMOVZXWQrm addr:$src)>, Requires<[HasSSE41]>;
+
+
+multiclass SS41I_binop_rm_int2<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
+ def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (IntId VR128:$src))]>, OpSize;
+
+  // Expecting an i16 load any-extended to an i32 value.
+ def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i16mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (IntId (bitconvert
+ (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))]>,
+ OpSize;
+}
+
+defm PMOVSXBQ : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq>;
+defm PMOVZXBQ : SS41I_binop_rm_int2<0x32, "pmovzxbq", int_x86_sse41_pmovzxbq>;
+
+// Common patterns involving scalar load
+def : Pat<(int_x86_sse41_pmovsxbq
+ (bitconvert (v4i32 (X86vzmovl
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+ (PMOVSXBQrm addr:$src)>, Requires<[HasSSE41]>;
+
+def : Pat<(int_x86_sse41_pmovzxbq
+ (bitconvert (v4i32 (X86vzmovl
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+ (PMOVZXBQrm addr:$src)>, Requires<[HasSSE41]>;
+
+
+/// SS41I_extract8 - SSE 4.1 extract 8 bits to a 32-bit register or
+/// 8-bit memory destination.
+multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
+ def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
+ (ins VR128:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set GR32:$dst, (X86pextrb (v16i8 VR128:$src1), imm:$src2))]>,
+ OpSize;
+ def mr : SS4AIi8<opc, MRMDestMem, (outs),
+ (ins i8mem:$dst, VR128:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, OpSize;
+// FIXME:
+// There's an AssertZext in the way of writing the store pattern
+// (store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))), addr:$dst)
+}
+
+defm PEXTRB : SS41I_extract8<0x14, "pextrb">;
+
+
+/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
+multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
+ def mr : SS4AIi8<opc, MRMDestMem, (outs),
+ (ins i16mem:$dst, VR128:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, OpSize;
+// FIXME:
+// There's an AssertZext in the way of writing the store pattern
+// (store (i16 (trunc (X86pextrw (v16i8 VR128:$src1), imm:$src2))), addr:$dst)
+}
+
+defm PEXTRW : SS41I_extract16<0x15, "pextrw">;
+
+
+/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
+multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
+ def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
+ (ins VR128:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set GR32:$dst,
+ (extractelt (v4i32 VR128:$src1), imm:$src2))]>, OpSize;
+ def mr : SS4AIi8<opc, MRMDestMem, (outs),
+ (ins i32mem:$dst, VR128:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
+ addr:$dst)]>, OpSize;
+}
+
+defm PEXTRD : SS41I_extract32<0x16, "pextrd">;
+
+
+/// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory
+/// destination
+multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> {
+ def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
+ (ins VR128:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set GR32:$dst,
+ (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>,
+ OpSize;
+ def mr : SS4AIi8<opc, MRMDestMem, (outs),
+ (ins f32mem:$dst, VR128:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
+ addr:$dst)]>, OpSize;
+}
+
+defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">;
+
+// Also match an EXTRACTPS store when the store is done as f32 instead of i32.
+def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
+ imm:$src2))),
+ addr:$dst),
+ (EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
+ Requires<[HasSSE41]>;
+
+let Constraints = "$src1 = $dst" in {
+ multiclass SS41I_insert8<bits<8> opc, string OpcodeStr> {
+ def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, GR32:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst,
+ (X86pinsrb VR128:$src1, GR32:$src2, imm:$src3))]>, OpSize;
+ def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i8mem:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst,
+ (X86pinsrb VR128:$src1, (extloadi8 addr:$src2),
+ imm:$src3))]>, OpSize;
+ }
+}
+
+defm PINSRB : SS41I_insert8<0x20, "pinsrb">;
+
+let Constraints = "$src1 = $dst" in {
+ multiclass SS41I_insert32<bits<8> opc, string OpcodeStr> {
+ def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, GR32:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst,
+ (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
+ OpSize;
+ def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i32mem:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst,
+ (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2),
+ imm:$src3)))]>, OpSize;
+ }
+}
+
+defm PINSRD : SS41I_insert32<0x22, "pinsrd">;
+
+// insertps has a few different modes. The first two below are optimized
+// inserts that never zero arbitrary elements in the destination vector; the
+// form that matches the intrinsic may zero arbitrary elements in the target
+// vector.
+let Constraints = "$src1 = $dst" in {
+ multiclass SS41I_insertf32<bits<8> opc, string OpcodeStr> {
+ def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst,
+ (X86insrtps VR128:$src1, VR128:$src2, imm:$src3))]>,
+ OpSize;
+ def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, f32mem:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst,
+ (X86insrtps VR128:$src1,
+ (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
+ imm:$src3))]>, OpSize;
+ }
+}
+
+defm INSERTPS : SS41I_insertf32<0x21, "insertps">;
+
+def : Pat<(int_x86_sse41_insertps VR128:$src1, VR128:$src2, imm:$src3),
+ (INSERTPSrr VR128:$src1, VR128:$src2, imm:$src3)>;
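+// For reference: in the insertps immediate, bits [7:6] select the source
+// lane, bits [5:4] the destination lane, and bits [3:0] form a zero mask;
+// e.g. _mm_insert_ps(a, b, 0x10) copies b[0] into a[1] and zeroes nothing.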
+
+// ptest instruction. X86ISelLowering lowers to this primarily from the
+// corresponding Intel intrinsic.
+let Defs = [EFLAGS] in {
+def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
+ "ptest \t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86ptest VR128:$src1, VR128:$src2))]>,
+ OpSize;
+def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, i128mem:$src2),
+ "ptest \t{$src2, $src1|$src1, $src2}",
+ [(set EFLAGS, (X86ptest VR128:$src1, (load addr:$src2)))]>,
+ OpSize;
+}
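+// For reference: PTEST sets ZF when (src1 & src2) == 0 and CF when
+// (~src1 & src2) == 0; this is what _mm_testz_si128 / _mm_testc_si128 use.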
+
+def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "movntdqa\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>,
+ OpSize;
+
+
+//===----------------------------------------------------------------------===//
+// SSE4.2 Instructions
+//===----------------------------------------------------------------------===//
+
+/// SS42I_binop_rm_int - Simple SSE 4.2 binary operator
+let Constraints = "$src1 = $dst" in {
+ multiclass SS42I_binop_rm_int<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId128, bit Commutable = 0> {
+ def rr : SS428I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
+ OpSize {
+ let isCommutable = Commutable;
+ }
+ def rm : SS428I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst,
+ (IntId128 VR128:$src1,
+ (bitconvert (memopv16i8 addr:$src2))))]>, OpSize;
+ }
+}
+
+defm PCMPGTQ : SS42I_binop_rm_int<0x37, "pcmpgtq", int_x86_sse42_pcmpgtq>;
+
+def : Pat<(v2i64 (X86pcmpgtq VR128:$src1, VR128:$src2)),
+ (PCMPGTQrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v2i64 (X86pcmpgtq VR128:$src1, (memop addr:$src2))),
+ (PCMPGTQrm VR128:$src1, addr:$src2)>;
+
+// crc intrinsic instructions
+// This set of instructions comes in r and m forms; the only difference
+// between the variants is the size of r and m.
+let Constraints = "$src1 = $dst" in {
+ def CRC32m8 : SS42FI<0xF0, MRMSrcMem, (outs GR32:$dst),
+ (ins GR32:$src1, i8mem:$src2),
+ "crc32{b} \t{$src2, $src1|$src1, $src2}",
+ [(set GR32:$dst,
+ (int_x86_sse42_crc32_8 GR32:$src1,
+ (load addr:$src2)))]>;
+ def CRC32r8 : SS42FI<0xF0, MRMSrcReg, (outs GR32:$dst),
+ (ins GR32:$src1, GR8:$src2),
+ "crc32{b} \t{$src2, $src1|$src1, $src2}",
+ [(set GR32:$dst,
+ (int_x86_sse42_crc32_8 GR32:$src1, GR8:$src2))]>;
+ def CRC32m16 : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst),
+ (ins GR32:$src1, i16mem:$src2),
+ "crc32{w} \t{$src2, $src1|$src1, $src2}",
+ [(set GR32:$dst,
+ (int_x86_sse42_crc32_16 GR32:$src1,
+ (load addr:$src2)))]>,
+ OpSize;
+ def CRC32r16 : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst),
+ (ins GR32:$src1, GR16:$src2),
+ "crc32{w} \t{$src2, $src1|$src1, $src2}",
+ [(set GR32:$dst,
+ (int_x86_sse42_crc32_16 GR32:$src1, GR16:$src2))]>,
+ OpSize;
+ def CRC32m32 : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst),
+ (ins GR32:$src1, i32mem:$src2),
+ "crc32{l} \t{$src2, $src1|$src1, $src2}",
+ [(set GR32:$dst,
+ (int_x86_sse42_crc32_32 GR32:$src1,
+ (load addr:$src2)))]>;
+ def CRC32r32 : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst),
+ (ins GR32:$src1, GR32:$src2),
+ "crc32{l} \t{$src2, $src1|$src1, $src2}",
+ [(set GR32:$dst,
+ (int_x86_sse42_crc32_32 GR32:$src1, GR32:$src2))]>;
+ def CRC64m8 : SS42FI<0xF0, MRMSrcMem, (outs GR64:$dst),
+ (ins GR64:$src1, i8mem:$src2),
+ "crc32{b} \t{$src2, $src1|$src1, $src2}",
+ [(set GR64:$dst,
+ (int_x86_sse42_crc64_8 GR64:$src1,
+ (load addr:$src2)))]>,
+ REX_W;
+ def CRC64r8 : SS42FI<0xF0, MRMSrcReg, (outs GR64:$dst),
+ (ins GR64:$src1, GR8:$src2),
+ "crc32{b} \t{$src2, $src1|$src1, $src2}",
+ [(set GR64:$dst,
+ (int_x86_sse42_crc64_8 GR64:$src1, GR8:$src2))]>,
+ REX_W;
+ def CRC64m64 : SS42FI<0xF1, MRMSrcMem, (outs GR64:$dst),
+ (ins GR64:$src1, i64mem:$src2),
+ "crc32{q} \t{$src2, $src1|$src1, $src2}",
+ [(set GR64:$dst,
+ (int_x86_sse42_crc64_64 GR64:$src1,
+ (load addr:$src2)))]>,
+ REX_W;
+ def CRC64r64 : SS42FI<0xF1, MRMSrcReg, (outs GR64:$dst),
+ (ins GR64:$src1, GR64:$src2),
+ "crc32{q} \t{$src2, $src1|$src1, $src2}",
+ [(set GR64:$dst,
+ (int_x86_sse42_crc64_64 GR64:$src1, GR64:$src2))]>,
+ REX_W;
+}
+
+// String/text processing instructions.
+let Defs = [EFLAGS], usesCustomInserter = 1 in {
+def PCMPISTRM128REG : SS42AI<0, Pseudo, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+ "#PCMPISTRM128rr PSEUDO!",
+ [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, VR128:$src2,
+ imm:$src3))]>, OpSize;
+def PCMPISTRM128MEM : SS42AI<0, Pseudo, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+ "#PCMPISTRM128rm PSEUDO!",
+ [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, (load addr:$src2),
+ imm:$src3))]>, OpSize;
+}
+
+let Defs = [XMM0, EFLAGS] in {
+def PCMPISTRM128rr : SS42AI<0x62, MRMSrcReg, (outs),
+ (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+ "pcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize;
+def PCMPISTRM128rm : SS42AI<0x62, MRMSrcMem, (outs),
+ (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+ "pcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize;
+}
+
+let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
+def PCMPESTRM128REG : SS42AI<0, Pseudo, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src3, i8imm:$src5),
+ "#PCMPESTRM128rr PSEUDO!",
+ [(set VR128:$dst,
+ (int_x86_sse42_pcmpestrm128
+ VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>, OpSize;
+
+def PCMPESTRM128MEM : SS42AI<0, Pseudo, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
+ "#PCMPESTRM128rm PSEUDO!",
+ [(set VR128:$dst, (int_x86_sse42_pcmpestrm128
+ VR128:$src1, EAX, (load addr:$src3), EDX, imm:$src5))]>,
+ OpSize;
+}
+
+let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX] in {
+def PCMPESTRM128rr : SS42AI<0x60, MRMSrcReg, (outs),
+ (ins VR128:$src1, VR128:$src3, i8imm:$src5),
+ "pcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>, OpSize;
+def PCMPESTRM128rm : SS42AI<0x60, MRMSrcMem, (outs),
+ (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
+ "pcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>, OpSize;
+}
+
+let Defs = [ECX, EFLAGS] in {
+ multiclass SS42AI_pcmpistri<Intrinsic IntId128> {
+ def rr : SS42AI<0x63, MRMSrcReg, (outs),
+ (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+ "pcmpistri\t{$src3, $src2, $src1|$src1, $src2, $src3}",
+ [(set ECX, (IntId128 VR128:$src1, VR128:$src2, imm:$src3)),
+ (implicit EFLAGS)]>, OpSize;
+ def rm : SS42AI<0x63, MRMSrcMem, (outs),
+ (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+ "pcmpistri\t{$src3, $src2, $src1|$src1, $src2, $src3}",
+ [(set ECX, (IntId128 VR128:$src1, (load addr:$src2), imm:$src3)),
+ (implicit EFLAGS)]>, OpSize;
+ }
+}
+
+defm PCMPISTRI : SS42AI_pcmpistri<int_x86_sse42_pcmpistri128>;
+defm PCMPISTRIA : SS42AI_pcmpistri<int_x86_sse42_pcmpistria128>;
+defm PCMPISTRIC : SS42AI_pcmpistri<int_x86_sse42_pcmpistric128>;
+defm PCMPISTRIO : SS42AI_pcmpistri<int_x86_sse42_pcmpistrio128>;
+defm PCMPISTRIS : SS42AI_pcmpistri<int_x86_sse42_pcmpistris128>;
+defm PCMPISTRIZ : SS42AI_pcmpistri<int_x86_sse42_pcmpistriz128>;
+
+let Defs = [ECX, EFLAGS] in {
+let Uses = [EAX, EDX] in {
+ multiclass SS42AI_pcmpestri<Intrinsic IntId128> {
+ def rr : SS42AI<0x61, MRMSrcReg, (outs),
+ (ins VR128:$src1, VR128:$src3, i8imm:$src5),
+ "pcmpestri\t{$src5, $src3, $src1|$src1, $src3, $src5}",
+ [(set ECX, (IntId128 VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5)),
+ (implicit EFLAGS)]>, OpSize;
+ def rm : SS42AI<0x61, MRMSrcMem, (outs),
+ (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
+ "pcmpestri\t{$src5, $src3, $src1|$src1, $src3, $src5}",
+ [(set ECX,
+ (IntId128 VR128:$src1, EAX, (load addr:$src3), EDX, imm:$src5)),
+ (implicit EFLAGS)]>, OpSize;
+ }
+}
+}
+
+defm PCMPESTRI : SS42AI_pcmpestri<int_x86_sse42_pcmpestri128>;
+defm PCMPESTRIA : SS42AI_pcmpestri<int_x86_sse42_pcmpestria128>;
+defm PCMPESTRIC : SS42AI_pcmpestri<int_x86_sse42_pcmpestric128>;
+defm PCMPESTRIO : SS42AI_pcmpestri<int_x86_sse42_pcmpestrio128>;
+defm PCMPESTRIS : SS42AI_pcmpestri<int_x86_sse42_pcmpestris128>;
+defm PCMPESTRIZ : SS42AI_pcmpestri<int_x86_sse42_pcmpestriz128>;
+
+//===----------------------------------------------------------------------===//
+// AES-NI Instructions
+//===----------------------------------------------------------------------===//
+
+let Constraints = "$src1 = $dst" in {
+ multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId128, bit Commutable = 0> {
+ def rr : AES8I<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
+ OpSize {
+ let isCommutable = Commutable;
+ }
+ def rm : AES8I<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst,
+ (IntId128 VR128:$src1,
+ (bitconvert (memopv16i8 addr:$src2))))]>, OpSize;
+ }
+}
+
+defm AESENC : AESI_binop_rm_int<0xDC, "aesenc",
+ int_x86_aesni_aesenc>;
+defm AESENCLAST : AESI_binop_rm_int<0xDD, "aesenclast",
+ int_x86_aesni_aesenclast>;
+defm AESDEC : AESI_binop_rm_int<0xDE, "aesdec",
+ int_x86_aesni_aesdec>;
+defm AESDECLAST : AESI_binop_rm_int<0xDF, "aesdeclast",
+ int_x86_aesni_aesdeclast>;
+
+def : Pat<(v2i64 (int_x86_aesni_aesenc VR128:$src1, VR128:$src2)),
+ (AESENCrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v2i64 (int_x86_aesni_aesenc VR128:$src1, (memop addr:$src2))),
+ (AESENCrm VR128:$src1, addr:$src2)>;
+def : Pat<(v2i64 (int_x86_aesni_aesenclast VR128:$src1, VR128:$src2)),
+ (AESENCLASTrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v2i64 (int_x86_aesni_aesenclast VR128:$src1, (memop addr:$src2))),
+ (AESENCLASTrm VR128:$src1, addr:$src2)>;
+def : Pat<(v2i64 (int_x86_aesni_aesdec VR128:$src1, VR128:$src2)),
+ (AESDECrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v2i64 (int_x86_aesni_aesdec VR128:$src1, (memop addr:$src2))),
+ (AESDECrm VR128:$src1, addr:$src2)>;
+def : Pat<(v2i64 (int_x86_aesni_aesdeclast VR128:$src1, VR128:$src2)),
+ (AESDECLASTrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v2i64 (int_x86_aesni_aesdeclast VR128:$src1, (memop addr:$src2))),
+ (AESDECLASTrm VR128:$src1, addr:$src2)>;
+
+def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1),
+ "aesimc\t{$src1, $dst|$dst, $src1}",
+ [(set VR128:$dst,
+ (int_x86_aesni_aesimc VR128:$src1))]>,
+ OpSize;
+
+def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src1),
+ "aesimc\t{$src1, $dst|$dst, $src1}",
+ [(set VR128:$dst,
+ (int_x86_aesni_aesimc (bitconvert (memopv2i64 addr:$src1))))]>,
+ OpSize;
+
+def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, i8imm:$src2),
+ "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst,
+ (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
+ OpSize;
+def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src1, i8imm:$src2),
+ "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst,
+ (int_x86_aesni_aeskeygenassist (bitconvert (memopv2i64 addr:$src1)),
+ imm:$src2))]>,
+ OpSize;
diff --git a/contrib/llvm/lib/Target/X86/X86JITInfo.cpp b/contrib/llvm/lib/Target/X86/X86JITInfo.cpp
new file mode 100644
index 0000000..6f0a8d9
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86JITInfo.cpp
@@ -0,0 +1,566 @@
+//===-- X86JITInfo.cpp - Implement the JIT interfaces for the X86 target --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the JIT interfaces for the X86 target.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "jit"
+#include "X86JITInfo.h"
+#include "X86Relocations.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/Function.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/System/Valgrind.h"
+#include <cstdlib>
+#include <cstring>
+using namespace llvm;
+
+// Determine the platform we're running on
+#if defined (__x86_64__) || defined (_M_AMD64) || defined (_M_X64)
+# define X86_64_JIT
+#elif defined(__i386__) || defined(i386) || defined(_M_IX86)
+# define X86_32_JIT
+#endif
+
+void X86JITInfo::replaceMachineCodeForFunction(void *Old, void *New) {
+ unsigned char *OldByte = (unsigned char *)Old;
+ *OldByte++ = 0xE9; // Emit JMP opcode.
+ unsigned *OldWord = (unsigned *)OldByte;
+ unsigned NewAddr = (intptr_t)New;
+ unsigned OldAddr = (intptr_t)OldWord;
+ *OldWord = NewAddr - OldAddr - 4; // Emit PC-relative addr of New code.
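+  // (The rel32 is relative to the end of the 5-byte jmp, i.e. OldWord + 4,
+  // hence the extra -4.)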
+
+ // X86 doesn't need to invalidate the processor cache, so just invalidate
+ // Valgrind's cache directly.
+ sys::ValgrindDiscardTranslations(Old, 5);
+}
+
+
+/// JITCompilerFunction - This contains the address of the JIT function used to
+/// compile a function lazily.
+static TargetJITInfo::JITCompilerFn JITCompilerFunction;
+
+// Get the ASMPREFIX for the current host. This is often '_'.
+#ifndef __USER_LABEL_PREFIX__
+#define __USER_LABEL_PREFIX__
+#endif
+#define GETASMPREFIX2(X) #X
+#define GETASMPREFIX(X) GETASMPREFIX2(X)
+#define ASMPREFIX GETASMPREFIX(__USER_LABEL_PREFIX__)
+
+// For ELF targets, use a .size and .type directive, to let tools
+// know the extent of functions defined in assembler.
+#if defined(__ELF__)
+# define SIZE(sym) ".size " #sym ", . - " #sym "\n"
+# define TYPE_FUNCTION(sym) ".type " #sym ", @function\n"
+#else
+# define SIZE(sym)
+# define TYPE_FUNCTION(sym)
+#endif
+
+// Provide a convenient way to disable the use of CFI directives.
+// This is needed for old/broken assemblers (for example, gas on
+// Darwin is pretty old and doesn't support these directives).
+#if defined(__APPLE__)
+# define CFI(x)
+#else
+// FIXME: Disable this until we really want to use it. Also, we will
+// need to add some workarounds for compilers that support
+// only a subset of these directives.
+# define CFI(x)
+#endif
+
+// Provide a wrapper for X86CompilationCallback2 that saves non-traditional
+// callee-saved registers for the fastcc calling convention.
+extern "C" {
+#if defined(X86_64_JIT)
+# ifndef _MSC_VER
+ // No need to save EAX/EDX for X86-64.
+ void X86CompilationCallback(void);
+ asm(
+ ".text\n"
+ ".align 8\n"
+ ".globl " ASMPREFIX "X86CompilationCallback\n"
+ TYPE_FUNCTION(X86CompilationCallback)
+ ASMPREFIX "X86CompilationCallback:\n"
+ CFI(".cfi_startproc\n")
+ // Save RBP
+ "pushq %rbp\n"
+ CFI(".cfi_def_cfa_offset 16\n")
+ CFI(".cfi_offset %rbp, -16\n")
+ // Save RSP
+ "movq %rsp, %rbp\n"
+ CFI(".cfi_def_cfa_register %rbp\n")
+ // Save all int arg registers
+ "pushq %rdi\n"
+ CFI(".cfi_rel_offset %rdi, 0\n")
+ "pushq %rsi\n"
+ CFI(".cfi_rel_offset %rsi, 8\n")
+ "pushq %rdx\n"
+ CFI(".cfi_rel_offset %rdx, 16\n")
+ "pushq %rcx\n"
+ CFI(".cfi_rel_offset %rcx, 24\n")
+ "pushq %r8\n"
+ CFI(".cfi_rel_offset %r8, 32\n")
+ "pushq %r9\n"
+ CFI(".cfi_rel_offset %r9, 40\n")
+    // Align the stack on a 16-byte boundary. RSP might only be 8-byte
+    // aligned if this is called from an indirect stub.
+ "andq $-16, %rsp\n"
+ // Save all XMM arg registers
+ "subq $128, %rsp\n"
+ "movaps %xmm0, (%rsp)\n"
+ "movaps %xmm1, 16(%rsp)\n"
+ "movaps %xmm2, 32(%rsp)\n"
+ "movaps %xmm3, 48(%rsp)\n"
+ "movaps %xmm4, 64(%rsp)\n"
+ "movaps %xmm5, 80(%rsp)\n"
+ "movaps %xmm6, 96(%rsp)\n"
+ "movaps %xmm7, 112(%rsp)\n"
+ // JIT callee
+ "movq %rbp, %rdi\n" // Pass prev frame and return address
+ "movq 8(%rbp), %rsi\n"
+ "call " ASMPREFIX "X86CompilationCallback2\n"
+ // Restore all XMM arg registers
+ "movaps 112(%rsp), %xmm7\n"
+ "movaps 96(%rsp), %xmm6\n"
+ "movaps 80(%rsp), %xmm5\n"
+ "movaps 64(%rsp), %xmm4\n"
+ "movaps 48(%rsp), %xmm3\n"
+ "movaps 32(%rsp), %xmm2\n"
+ "movaps 16(%rsp), %xmm1\n"
+ "movaps (%rsp), %xmm0\n"
+ // Restore RSP
+ "movq %rbp, %rsp\n"
+ CFI(".cfi_def_cfa_register %rsp\n")
+ // Restore all int arg registers
+ "subq $48, %rsp\n"
+ CFI(".cfi_adjust_cfa_offset 48\n")
+ "popq %r9\n"
+ CFI(".cfi_adjust_cfa_offset -8\n")
+ CFI(".cfi_restore %r9\n")
+ "popq %r8\n"
+ CFI(".cfi_adjust_cfa_offset -8\n")
+ CFI(".cfi_restore %r8\n")
+ "popq %rcx\n"
+ CFI(".cfi_adjust_cfa_offset -8\n")
+ CFI(".cfi_restore %rcx\n")
+ "popq %rdx\n"
+ CFI(".cfi_adjust_cfa_offset -8\n")
+ CFI(".cfi_restore %rdx\n")
+ "popq %rsi\n"
+ CFI(".cfi_adjust_cfa_offset -8\n")
+ CFI(".cfi_restore %rsi\n")
+ "popq %rdi\n"
+ CFI(".cfi_adjust_cfa_offset -8\n")
+ CFI(".cfi_restore %rdi\n")
+ // Restore RBP
+ "popq %rbp\n"
+ CFI(".cfi_adjust_cfa_offset -8\n")
+ CFI(".cfi_restore %rbp\n")
+ "ret\n"
+ CFI(".cfi_endproc\n")
+ SIZE(X86CompilationCallback)
+ );
+# else
+  // No inline assembler support on this platform. The routine is in an
+  // external file.
+ void X86CompilationCallback();
+
+# endif
+#elif defined (X86_32_JIT)
+# ifndef _MSC_VER
+ void X86CompilationCallback(void);
+ asm(
+ ".text\n"
+ ".align 8\n"
+ ".globl " ASMPREFIX "X86CompilationCallback\n"
+ TYPE_FUNCTION(X86CompilationCallback)
+ ASMPREFIX "X86CompilationCallback:\n"
+ CFI(".cfi_startproc\n")
+ "pushl %ebp\n"
+ CFI(".cfi_def_cfa_offset 8\n")
+ CFI(".cfi_offset %ebp, -8\n")
+ "movl %esp, %ebp\n" // Standard prologue
+ CFI(".cfi_def_cfa_register %ebp\n")
+ "pushl %eax\n"
+ CFI(".cfi_rel_offset %eax, 0\n")
+ "pushl %edx\n" // Save EAX/EDX/ECX
+ CFI(".cfi_rel_offset %edx, 4\n")
+ "pushl %ecx\n"
+ CFI(".cfi_rel_offset %ecx, 8\n")
+# if defined(__APPLE__)
+ "andl $-16, %esp\n" // Align ESP on 16-byte boundary
+# endif
+ "subl $16, %esp\n"
+ "movl 4(%ebp), %eax\n" // Pass prev frame and return address
+ "movl %eax, 4(%esp)\n"
+ "movl %ebp, (%esp)\n"
+ "call " ASMPREFIX "X86CompilationCallback2\n"
+ "movl %ebp, %esp\n" // Restore ESP
+ CFI(".cfi_def_cfa_register %esp\n")
+ "subl $12, %esp\n"
+ CFI(".cfi_adjust_cfa_offset 12\n")
+ "popl %ecx\n"
+ CFI(".cfi_adjust_cfa_offset -4\n")
+ CFI(".cfi_restore %ecx\n")
+ "popl %edx\n"
+ CFI(".cfi_adjust_cfa_offset -4\n")
+ CFI(".cfi_restore %edx\n")
+ "popl %eax\n"
+ CFI(".cfi_adjust_cfa_offset -4\n")
+ CFI(".cfi_restore %eax\n")
+ "popl %ebp\n"
+ CFI(".cfi_adjust_cfa_offset -4\n")
+ CFI(".cfi_restore %ebp\n")
+ "ret\n"
+ CFI(".cfi_endproc\n")
+ SIZE(X86CompilationCallback)
+ );
+
+ // Same as X86CompilationCallback but also saves XMM argument registers.
+ void X86CompilationCallback_SSE(void);
+ asm(
+ ".text\n"
+ ".align 8\n"
+ ".globl " ASMPREFIX "X86CompilationCallback_SSE\n"
+ TYPE_FUNCTION(X86CompilationCallback_SSE)
+ ASMPREFIX "X86CompilationCallback_SSE:\n"
+ CFI(".cfi_startproc\n")
+ "pushl %ebp\n"
+ CFI(".cfi_def_cfa_offset 8\n")
+ CFI(".cfi_offset %ebp, -8\n")
+ "movl %esp, %ebp\n" // Standard prologue
+ CFI(".cfi_def_cfa_register %ebp\n")
+ "pushl %eax\n"
+ CFI(".cfi_rel_offset %eax, 0\n")
+ "pushl %edx\n" // Save EAX/EDX/ECX
+ CFI(".cfi_rel_offset %edx, 4\n")
+ "pushl %ecx\n"
+ CFI(".cfi_rel_offset %ecx, 8\n")
+ "andl $-16, %esp\n" // Align ESP on 16-byte boundary
+ // Save all XMM arg registers
+ "subl $64, %esp\n"
+ // FIXME: provide frame move information for xmm registers.
+ // This can be tricky, because CFA register is ebp (unaligned)
+ // and we need to produce offsets relative to it.
+ "movaps %xmm0, (%esp)\n"
+ "movaps %xmm1, 16(%esp)\n"
+ "movaps %xmm2, 32(%esp)\n"
+ "movaps %xmm3, 48(%esp)\n"
+ "subl $16, %esp\n"
+ "movl 4(%ebp), %eax\n" // Pass prev frame and return address
+ "movl %eax, 4(%esp)\n"
+ "movl %ebp, (%esp)\n"
+ "call " ASMPREFIX "X86CompilationCallback2\n"
+ "addl $16, %esp\n"
+ "movaps 48(%esp), %xmm3\n"
+ CFI(".cfi_restore %xmm3\n")
+ "movaps 32(%esp), %xmm2\n"
+ CFI(".cfi_restore %xmm2\n")
+ "movaps 16(%esp), %xmm1\n"
+ CFI(".cfi_restore %xmm1\n")
+ "movaps (%esp), %xmm0\n"
+ CFI(".cfi_restore %xmm0\n")
+ "movl %ebp, %esp\n" // Restore ESP
+ CFI(".cfi_def_cfa_register esp\n")
+ "subl $12, %esp\n"
+ CFI(".cfi_adjust_cfa_offset 12\n")
+ "popl %ecx\n"
+ CFI(".cfi_adjust_cfa_offset -4\n")
+ CFI(".cfi_restore %ecx\n")
+ "popl %edx\n"
+ CFI(".cfi_adjust_cfa_offset -4\n")
+ CFI(".cfi_restore %edx\n")
+ "popl %eax\n"
+ CFI(".cfi_adjust_cfa_offset -4\n")
+ CFI(".cfi_restore %eax\n")
+ "popl %ebp\n"
+ CFI(".cfi_adjust_cfa_offset -4\n")
+ CFI(".cfi_restore %ebp\n")
+ "ret\n"
+ CFI(".cfi_endproc\n")
+ SIZE(X86CompilationCallback_SSE)
+ );
+# else
+ void X86CompilationCallback2(intptr_t *StackPtr, intptr_t RetAddr);
+
+  __declspec(naked) void X86CompilationCallback(void) {
+ __asm {
+ push ebp
+ mov ebp, esp
+ push eax
+ push edx
+ push ecx
+ and esp, -16
+ sub esp, 16
+ mov eax, dword ptr [ebp+4]
+ mov dword ptr [esp+4], eax
+ mov dword ptr [esp], ebp
+ call X86CompilationCallback2
+ mov esp, ebp
+ sub esp, 12
+ pop ecx
+ pop edx
+ pop eax
+ pop ebp
+ ret
+ }
+ }
+
+# endif // _MSC_VER
+
+#else // Not an i386 host
+ void X86CompilationCallback() {
+ llvm_unreachable("Cannot call X86CompilationCallback() on a non-x86 arch!");
+ }
+#endif
+}
+
+/// X86CompilationCallback2 - This is the target-specific function invoked by the
+/// function stub when we did not know the real target of a call. This function
+/// must locate the start of the stub or call site and pass it into the JIT
+/// compiler function.
+extern "C" {
+#if !(defined (X86_64_JIT) && defined(_MSC_VER))
+  // The following function is called only from this translation unit,
+  // unless we are under 64-bit Windows with MSVC, where there is
+  // no support for inline assembly.
+static
+#endif
+void ATTRIBUTE_USED
+X86CompilationCallback2(intptr_t *StackPtr, intptr_t RetAddr) {
+ intptr_t *RetAddrLoc = &StackPtr[1];
+ assert(*RetAddrLoc == RetAddr &&
+ "Could not find return address on the stack!");
+
+ // It's a stub if there is an interrupt marker after the call.
+ bool isStub = ((unsigned char*)RetAddr)[0] == 0xCE;
+
+  // The call instruction should have pushed the return address onto the stack...
+#if defined (X86_64_JIT)
+ RetAddr--; // Backtrack to the reference itself...
+#else
+ RetAddr -= 4; // Backtrack to the reference itself...
+#endif
+
+#if 0
+ DEBUG(dbgs() << "In callback! Addr=" << (void*)RetAddr
+ << " ESP=" << (void*)StackPtr
+ << ": Resolving call to function: "
+ << TheVM->getFunctionReferencedName((void*)RetAddr) << "\n");
+#endif
+
+ // Sanity check to make sure this really is a call instruction.
+#if defined (X86_64_JIT)
+ assert(((unsigned char*)RetAddr)[-2] == 0x41 &&"Not a call instr!");
+ assert(((unsigned char*)RetAddr)[-1] == 0xFF &&"Not a call instr!");
+#else
+ assert(((unsigned char*)RetAddr)[-1] == 0xE8 &&"Not a call instr!");
+#endif
+
+ intptr_t NewVal = (intptr_t)JITCompilerFunction((void*)RetAddr);
+
+ // Rewrite the call target... so that we don't end up here every time we
+ // execute the call.
+#if defined (X86_64_JIT)
+ assert(isStub &&
+ "X86-64 doesn't support rewriting non-stub lazy compilation calls:"
+ " the call instruction varies too much.");
+#else
+ *(intptr_t *)RetAddr = (intptr_t)(NewVal-RetAddr-4);
+#endif
+
+ if (isStub) {
+ // If this is a stub, rewrite the call into an unconditional branch
+ // instruction so that two return addresses are not pushed onto the stack
+ // when the requested function finally gets called. This also makes the
+    // 0xCE byte (interrupt) dead, so the marker doesn't affect anything.
+#if defined (X86_64_JIT)
+ // If the target address is within 32-bit range of the stub, use a
+ // PC-relative branch instead of loading the actual address. (This is
+ // considerably shorter than the 64-bit immediate load already there.)
+ // We assume here intptr_t is 64 bits.
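+    // The rel32 branch written below starts 0xc bytes before RetAddr and is
+    // 5 bytes long, so its target is relative to RetAddr - 0xc + 5; hence
+    // the +7.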
+ intptr_t diff = NewVal-RetAddr+7;
+ if (diff >= -2147483648LL && diff <= 2147483647LL) {
+ *(unsigned char*)(RetAddr-0xc) = 0xE9;
+ *(intptr_t *)(RetAddr-0xb) = diff & 0xffffffff;
+ } else {
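+      // Patch the stub's movabs immediate with the real target, then turn the
+      // trailing 'callq *%r10' (ModRM 0xD2) into 'jmpq *%r10' (ModRM 0xE2).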
+ *(intptr_t *)(RetAddr - 0xa) = NewVal;
+ ((unsigned char*)RetAddr)[0] = (2 | (4 << 3) | (3 << 6));
+ }
+ sys::ValgrindDiscardTranslations((void*)(RetAddr-0xc), 0xd);
+#else
+ ((unsigned char*)RetAddr)[-1] = 0xE9;
+ sys::ValgrindDiscardTranslations((void*)(RetAddr-1), 5);
+#endif
+ }
+
+ // Change the return address to reexecute the call instruction...
+#if defined (X86_64_JIT)
+ *RetAddrLoc -= 0xd;
+#else
+ *RetAddrLoc -= 5;
+#endif
+}
+}
+
+TargetJITInfo::LazyResolverFn
+X86JITInfo::getLazyResolverFunction(JITCompilerFn F) {
+ JITCompilerFunction = F;
+
+#if defined (X86_32_JIT) && !defined (_MSC_VER)
+ if (Subtarget->hasSSE1())
+ return X86CompilationCallback_SSE;
+#endif
+
+ return X86CompilationCallback;
+}
+
+X86JITInfo::X86JITInfo(X86TargetMachine &tm) : TM(tm) {
+ Subtarget = &TM.getSubtarget<X86Subtarget>();
+ useGOT = 0;
+ TLSOffset = 0;
+}
+
+void *X86JITInfo::emitGlobalValueIndirectSym(const GlobalValue* GV, void *ptr,
+ JITCodeEmitter &JCE) {
+#if defined (X86_64_JIT)
+ const unsigned Alignment = 8;
+ uint8_t Buffer[8];
+ uint8_t *Cur = Buffer;
+ MachineCodeEmitter::emitWordLEInto(Cur, (unsigned)(intptr_t)ptr);
+ MachineCodeEmitter::emitWordLEInto(Cur, (unsigned)(((intptr_t)ptr) >> 32));
+#else
+ const unsigned Alignment = 4;
+ uint8_t Buffer[4];
+ uint8_t *Cur = Buffer;
+ MachineCodeEmitter::emitWordLEInto(Cur, (intptr_t)ptr);
+#endif
+ return JCE.allocIndirectGV(GV, Buffer, sizeof(Buffer), Alignment);
+}
+
+TargetJITInfo::StubLayout X86JITInfo::getStubLayout() {
+ // The 64-bit stub contains:
+ // movabs r10 <- 8-byte-target-address # 10 bytes
+ // call|jmp *r10 # 3 bytes
+ // The 32-bit stub contains a 5-byte call|jmp.
+ // If the stub is a call to the compilation callback, an extra byte is added
+ // to mark it as a stub.
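+  // Worst case is the 64-bit lazy-compile stub: 49 BA <imm64> (10 bytes) +
+  // 41 FF D2 (3 bytes) + 0xCE marker (1 byte) = 14 bytes, 4-byte aligned.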
+ StubLayout Result = {14, 4};
+ return Result;
+}
+
+void *X86JITInfo::emitFunctionStub(const Function* F, void *Target,
+ JITCodeEmitter &JCE) {
+ // Note, we cast to intptr_t here to silence a -pedantic warning that
+ // complains about casting a function pointer to a normal pointer.
+#if defined (X86_32_JIT) && !defined (_MSC_VER)
+ bool NotCC = (Target != (void*)(intptr_t)X86CompilationCallback &&
+ Target != (void*)(intptr_t)X86CompilationCallback_SSE);
+#else
+ bool NotCC = Target != (void*)(intptr_t)X86CompilationCallback;
+#endif
+ JCE.emitAlignment(4);
+ void *Result = (void*)JCE.getCurrentPCValue();
+ if (NotCC) {
+#if defined (X86_64_JIT)
+ JCE.emitByte(0x49); // REX prefix
+ JCE.emitByte(0xB8+2); // movabsq r10
+ JCE.emitWordLE((unsigned)(intptr_t)Target);
+ JCE.emitWordLE((unsigned)(((intptr_t)Target) >> 32));
+ JCE.emitByte(0x41); // REX prefix
+ JCE.emitByte(0xFF); // jmpq *r10
+ JCE.emitByte(2 | (4 << 3) | (3 << 6));
+#else
+ JCE.emitByte(0xE9);
+ JCE.emitWordLE((intptr_t)Target-JCE.getCurrentPCValue()-4);
+#endif
+ return Result;
+ }
+
+#if defined (X86_64_JIT)
+ JCE.emitByte(0x49); // REX prefix
+ JCE.emitByte(0xB8+2); // movabsq r10
+ JCE.emitWordLE((unsigned)(intptr_t)Target);
+ JCE.emitWordLE((unsigned)(((intptr_t)Target) >> 32));
+ JCE.emitByte(0x41); // REX prefix
+ JCE.emitByte(0xFF); // callq *r10
+ JCE.emitByte(2 | (2 << 3) | (3 << 6));
+#else
+ JCE.emitByte(0xE8); // Call with 32 bit pc-rel destination...
+
+ JCE.emitWordLE((intptr_t)Target-JCE.getCurrentPCValue()-4);
+#endif
+
+ // This used to use 0xCD, but that value is used by JITMemoryManager to
+ // initialize the buffer with garbage, which means it may follow a
+ // noreturn function call, confusing X86CompilationCallback2. PR 4929.
+ JCE.emitByte(0xCE); // Interrupt - Just a marker identifying the stub!
+ return Result;
+}
+
+/// getPICJumpTableEntry - Returns the value of the jumptable entry for the
+/// specific basic block.
+uintptr_t X86JITInfo::getPICJumpTableEntry(uintptr_t BB, uintptr_t Entry) {
+#if defined(X86_64_JIT)
+ return BB - Entry;
+#else
+ return BB - PICBase;
+#endif
+}
+
+/// relocate - Before the JIT can run a block of code that has been emitted,
+/// it must rewrite the code to contain the actual addresses of any
+/// referenced global symbols.
+void X86JITInfo::relocate(void *Function, MachineRelocation *MR,
+ unsigned NumRelocs, unsigned char* GOTBase) {
+ for (unsigned i = 0; i != NumRelocs; ++i, ++MR) {
+ void *RelocPos = (char*)Function + MR->getMachineCodeOffset();
+ intptr_t ResultPtr = (intptr_t)MR->getResultPointer();
+ switch ((X86::RelocationType)MR->getRelocationType()) {
+ case X86::reloc_pcrel_word: {
+ // PC relative relocation, add the relocated value to the value already in
+ // memory, after we adjust it for where the PC is.
+ ResultPtr = ResultPtr -(intptr_t)RelocPos - 4 - MR->getConstantVal();
+ *((unsigned*)RelocPos) += (unsigned)ResultPtr;
+ break;
+ }
+ case X86::reloc_picrel_word: {
+ // PIC base relative relocation, add the relocated value to the value
+ // already in memory, after we adjust it for where the PIC base is.
+ ResultPtr = ResultPtr - ((intptr_t)Function + MR->getConstantVal());
+ *((unsigned*)RelocPos) += (unsigned)ResultPtr;
+ break;
+ }
+ case X86::reloc_absolute_word:
+ case X86::reloc_absolute_word_sext:
+ // Absolute relocation, just add the relocated value to the value already
+ // in memory.
+ *((unsigned*)RelocPos) += (unsigned)ResultPtr;
+ break;
+ case X86::reloc_absolute_dword:
+ *((intptr_t*)RelocPos) += ResultPtr;
+ break;
+ }
+ }
+}
+
+char* X86JITInfo::allocateThreadLocalMemory(size_t size) {
+#if defined(X86_32_JIT) && !defined(__APPLE__) && !defined(_MSC_VER)
+ TLSOffset -= size;
+ return TLSOffset;
+#else
+ llvm_unreachable("Cannot allocate thread local storage on this arch!");
+ return 0;
+#endif
+}
diff --git a/contrib/llvm/lib/Target/X86/X86JITInfo.h b/contrib/llvm/lib/Target/X86/X86JITInfo.h
new file mode 100644
index 0000000..238420c
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86JITInfo.h
@@ -0,0 +1,81 @@
+//===- X86JITInfo.h - X86 implementation of the JIT interface --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetJITInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86JITINFO_H
+#define X86JITINFO_H
+
+#include "llvm/Function.h"
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/Target/TargetJITInfo.h"
+
+namespace llvm {
+ class X86TargetMachine;
+ class X86Subtarget;
+
+ class X86JITInfo : public TargetJITInfo {
+ X86TargetMachine &TM;
+ const X86Subtarget *Subtarget;
+ uintptr_t PICBase;
+ char* TLSOffset;
+ public:
+ explicit X86JITInfo(X86TargetMachine &tm);
+
+ /// replaceMachineCodeForFunction - Make it so that calling the function
+ /// whose machine code is at OLD turns into a call to NEW, perhaps by
+ /// overwriting OLD with a branch to NEW. This is used for self-modifying
+ /// code.
+ ///
+ virtual void replaceMachineCodeForFunction(void *Old, void *New);
+
+ /// emitGlobalValueIndirectSym - Use the specified JITCodeEmitter object
+ /// to emit an indirect symbol which contains the address of the specified
+ /// ptr.
+ virtual void *emitGlobalValueIndirectSym(const GlobalValue* GV, void *ptr,
+ JITCodeEmitter &JCE);
+
+ // getStubLayout - Returns the size and alignment of the largest call stub
+ // on X86.
+ virtual StubLayout getStubLayout();
+
+ /// emitFunctionStub - Use the specified JITCodeEmitter object to emit a
+ /// small native function that simply calls the function at the specified
+ /// address.
+ virtual void *emitFunctionStub(const Function* F, void *Target,
+ JITCodeEmitter &JCE);
+
+ /// getPICJumpTableEntry - Returns the value of the jumptable entry for the
+ /// specific basic block.
+ virtual uintptr_t getPICJumpTableEntry(uintptr_t BB, uintptr_t JTBase);
+
+ /// getLazyResolverFunction - Expose the lazy resolver to the JIT.
+ virtual LazyResolverFn getLazyResolverFunction(JITCompilerFn);
+
+ /// relocate - Before the JIT can run a block of code that has been emitted,
+ /// it must rewrite the code to contain the actual addresses of any
+ /// referenced global symbols.
+ virtual void relocate(void *Function, MachineRelocation *MR,
+ unsigned NumRelocs, unsigned char* GOTBase);
+
+ /// allocateThreadLocalMemory - Each target has its own way of
+ /// handling thread local variables. This method returns a value only
+ /// meaningful to the target.
+ virtual char* allocateThreadLocalMemory(size_t size);
+
+ /// setPICBase / getPICBase - Getter / setter of PICBase, used to compute
+ /// PIC jumptable entry.
+ void setPICBase(uintptr_t Base) { PICBase = Base; }
+ uintptr_t getPICBase() const { return PICBase; }
+ };
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86MCAsmInfo.cpp b/contrib/llvm/lib/Target/X86/X86MCAsmInfo.cpp
new file mode 100644
index 0000000..2b8720b
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86MCAsmInfo.cpp
@@ -0,0 +1,110 @@
+//===-- X86MCAsmInfo.cpp - X86 asm properties -----------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the X86MCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86MCAsmInfo.h"
+#include "X86TargetMachine.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/Support/CommandLine.h"
+using namespace llvm;
+
+enum AsmWriterFlavorTy {
+ // Note: This numbering has to match the GCC assembler dialects for inline
+ // asm alternatives to work right.
+ ATT = 0, Intel = 1
+};
+
+static cl::opt<AsmWriterFlavorTy>
+AsmWriterFlavor("x86-asm-syntax", cl::init(ATT),
+ cl::desc("Choose style of code to emit from X86 backend:"),
+ cl::values(clEnumValN(ATT, "att", "Emit AT&T-style assembly"),
+ clEnumValN(Intel, "intel", "Emit Intel-style assembly"),
+ clEnumValEnd));
+
+
+static const char *const x86_asm_table[] = {
+ "{si}", "S",
+ "{di}", "D",
+ "{ax}", "a",
+ "{cx}", "c",
+ "{memory}", "memory",
+ "{flags}", "",
+ "{dirflag}", "",
+ "{fpsr}", "",
+ "{cc}", "cc",
+ 0,0};
+
+X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &Triple) {
+ AsmTransCBE = x86_asm_table;
+ AssemblerDialect = AsmWriterFlavor;
+
+ bool is64Bit = Triple.getArch() == Triple::x86_64;
+
+ TextAlignFillValue = 0x90;
+
+ if (!is64Bit)
+ Data64bitsDirective = 0; // we can't emit a 64-bit unit
+
+ // Use ## as a comment string so that .s files generated by llvm can go
+ // through the GCC preprocessor without causing an error. This is needed
+ // because "clang foo.s" runs the C preprocessor, which is usually reserved
+  // for .S files on other systems, presumably because the file system was
+  // not always case-preserving.
+ CommentString = "##";
+ PCSymbol = ".";
+
+ SupportsDebugInformation = true;
+ DwarfUsesInlineInfoSection = true;
+
+ // Exceptions handling
+ ExceptionsType = ExceptionHandling::Dwarf;
+}
+
+X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) {
+ AsmTransCBE = x86_asm_table;
+ AssemblerDialect = AsmWriterFlavor;
+
+ TextAlignFillValue = 0x90;
+
+ PrivateGlobalPrefix = ".L";
+ WeakRefDirective = "\t.weak\t";
+ PCSymbol = ".";
+
+ // Set up DWARF directives
+ HasLEB128 = true; // Target asm supports leb128 directives (little-endian)
+
+ // Debug Information
+ SupportsDebugInformation = true;
+
+ // Exceptions handling
+ ExceptionsType = ExceptionHandling::Dwarf;
+
+  // OpenBSD has buggy support for .quad in 32-bit mode, so just split 64-bit
+  // data into two 32-bit words.
+ if (T.getOS() == Triple::OpenBSD && T.getArch() == Triple::x86)
+ Data64bitsDirective = 0;
+}
+
+const MCSection *X86ELFMCAsmInfo::
+getNonexecutableStackSection(MCContext &Ctx) const {
+ return Ctx.getELFSection(".note.GNU-stack", MCSectionELF::SHT_PROGBITS,
+ 0, SectionKind::getMetadata(), false);
+}
+
+X86MCAsmInfoCOFF::X86MCAsmInfoCOFF(const Triple &Triple) {
+ AsmTransCBE = x86_asm_table;
+ AssemblerDialect = AsmWriterFlavor;
+
+ TextAlignFillValue = 0x90;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86MCAsmInfo.h b/contrib/llvm/lib/Target/X86/X86MCAsmInfo.h
new file mode 100644
index 0000000..5815225
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86MCAsmInfo.h
@@ -0,0 +1,38 @@
+//=====-- X86MCAsmInfo.h - X86 asm properties -----------------*- C++ -*--====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the X86MCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86TARGETASMINFO_H
+#define X86TARGETASMINFO_H
+
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmInfoCOFF.h"
+#include "llvm/MC/MCAsmInfoDarwin.h"
+
+namespace llvm {
+ class Triple;
+
+ struct X86MCAsmInfoDarwin : public MCAsmInfoDarwin {
+ explicit X86MCAsmInfoDarwin(const Triple &Triple);
+ };
+
+ struct X86ELFMCAsmInfo : public MCAsmInfo {
+ explicit X86ELFMCAsmInfo(const Triple &Triple);
+ virtual const MCSection *getNonexecutableStackSection(MCContext &Ctx) const;
+ };
+
+ struct X86MCAsmInfoCOFF : public MCAsmInfoCOFF {
+ explicit X86MCAsmInfoCOFF(const Triple &Triple);
+ };
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86MCCodeEmitter.cpp b/contrib/llvm/lib/Target/X86/X86MCCodeEmitter.cpp
new file mode 100644
index 0000000..a9681e6
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86MCCodeEmitter.cpp
@@ -0,0 +1,659 @@
+//===-- X86/X86MCCodeEmitter.cpp - Convert X86 code to machine code -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the X86MCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-emitter"
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86FixupKinds.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+namespace {
+class X86MCCodeEmitter : public MCCodeEmitter {
+ X86MCCodeEmitter(const X86MCCodeEmitter &); // DO NOT IMPLEMENT
+ void operator=(const X86MCCodeEmitter &); // DO NOT IMPLEMENT
+ const TargetMachine &TM;
+ const TargetInstrInfo &TII;
+ MCContext &Ctx;
+ bool Is64BitMode;
+public:
+ X86MCCodeEmitter(TargetMachine &tm, MCContext &ctx, bool is64Bit)
+ : TM(tm), TII(*TM.getInstrInfo()), Ctx(ctx) {
+ Is64BitMode = is64Bit;
+ }
+
+ ~X86MCCodeEmitter() {}
+
+ unsigned getNumFixupKinds() const {
+ return 4;
+ }
+
+ const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const {
+ const static MCFixupKindInfo Infos[] = {
+ { "reloc_pcrel_4byte", 0, 4 * 8, MCFixupKindInfo::FKF_IsPCRel },
+ { "reloc_pcrel_1byte", 0, 1 * 8, MCFixupKindInfo::FKF_IsPCRel },
+ { "reloc_riprel_4byte", 0, 4 * 8, MCFixupKindInfo::FKF_IsPCRel },
+ { "reloc_riprel_4byte_movq_load", 0, 4 * 8, MCFixupKindInfo::FKF_IsPCRel }
+ };
+
+ if (Kind < FirstTargetFixupKind)
+ return MCCodeEmitter::getFixupKindInfo(Kind);
+
+ assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+ "Invalid kind!");
+ return Infos[Kind - FirstTargetFixupKind];
+ }
+
+ static unsigned GetX86RegNum(const MCOperand &MO) {
+ return X86RegisterInfo::getX86RegNum(MO.getReg());
+ }
+
+ void EmitByte(unsigned char C, unsigned &CurByte, raw_ostream &OS) const {
+ OS << (char)C;
+ ++CurByte;
+ }
+
+ void EmitConstant(uint64_t Val, unsigned Size, unsigned &CurByte,
+ raw_ostream &OS) const {
+ // Output the constant in little endian byte order.
+ for (unsigned i = 0; i != Size; ++i) {
+ EmitByte(Val & 255, CurByte, OS);
+ Val >>= 8;
+ }
+ }
+
+ void EmitImmediate(const MCOperand &Disp,
+ unsigned ImmSize, MCFixupKind FixupKind,
+ unsigned &CurByte, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ int ImmOffset = 0) const;
+
+ inline static unsigned char ModRMByte(unsigned Mod, unsigned RegOpcode,
+ unsigned RM) {
+ assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!");
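+    // ModRM layout: [7:6] = mod, [5:3] = reg/opcode, [2:0] = r/m.
+    // For example, ModRMByte(3, 2, 1) == 0xD1.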
+ return RM | (RegOpcode << 3) | (Mod << 6);
+ }
+
+ void EmitRegModRMByte(const MCOperand &ModRMReg, unsigned RegOpcodeFld,
+ unsigned &CurByte, raw_ostream &OS) const {
+ EmitByte(ModRMByte(3, RegOpcodeFld, GetX86RegNum(ModRMReg)), CurByte, OS);
+ }
+
+ void EmitSIBByte(unsigned SS, unsigned Index, unsigned Base,
+ unsigned &CurByte, raw_ostream &OS) const {
+ // SIB byte is in the same format as the ModRMByte.
+ EmitByte(ModRMByte(SS, Index, Base), CurByte, OS);
+ }
+
+
+ void EmitMemModRMByte(const MCInst &MI, unsigned Op,
+ unsigned RegOpcodeField,
+ unsigned TSFlags, unsigned &CurByte, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups) const;
+
+ void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups) const;
+
+};
+
+} // end anonymous namespace
+
+
+MCCodeEmitter *llvm::createX86_32MCCodeEmitter(const Target &,
+ TargetMachine &TM,
+ MCContext &Ctx) {
+ return new X86MCCodeEmitter(TM, Ctx, false);
+}
+
+MCCodeEmitter *llvm::createX86_64MCCodeEmitter(const Target &,
+ TargetMachine &TM,
+ MCContext &Ctx) {
+ return new X86MCCodeEmitter(TM, Ctx, true);
+}
+
+
+/// isDisp8 - Return true if this signed displacement fits in an 8-bit
+/// sign-extended field.
+static bool isDisp8(int Value) {
+ return Value == (signed char)Value;
+}
+
+/// getImmFixupKind - Return the appropriate fixup kind to use for an immediate
+/// in an instruction with the specified TSFlags.
+static MCFixupKind getImmFixupKind(unsigned TSFlags) {
+ unsigned Size = X86II::getSizeOfImm(TSFlags);
+ bool isPCRel = X86II::isImmPCRel(TSFlags);
+
+ switch (Size) {
+ default: assert(0 && "Unknown immediate size");
+ case 1: return isPCRel ? MCFixupKind(X86::reloc_pcrel_1byte) : FK_Data_1;
+ case 4: return isPCRel ? MCFixupKind(X86::reloc_pcrel_4byte) : FK_Data_4;
+ case 2: assert(!isPCRel); return FK_Data_2;
+ case 8: assert(!isPCRel); return FK_Data_8;
+ }
+}
+
+
+void X86MCCodeEmitter::
+EmitImmediate(const MCOperand &DispOp, unsigned Size, MCFixupKind FixupKind,
+ unsigned &CurByte, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups, int ImmOffset) const {
+ // If this is a simple integer displacement that doesn't require a relocation,
+ // emit it now.
+ if (DispOp.isImm()) {
+ // FIXME: is this right for pc-rel encoding?? Probably need to emit this as
+ // a fixup if so.
+ EmitConstant(DispOp.getImm()+ImmOffset, Size, CurByte, OS);
+ return;
+ }
+
+ // If we have an immoffset, add it to the expression.
+ const MCExpr *Expr = DispOp.getExpr();
+
+ // If the fixup is pc-relative, we need to bias the value to be relative to
+ // the start of the field, not the end of the field.
+ if (FixupKind == MCFixupKind(X86::reloc_pcrel_4byte) ||
+ FixupKind == MCFixupKind(X86::reloc_riprel_4byte) ||
+ FixupKind == MCFixupKind(X86::reloc_riprel_4byte_movq_load))
+ ImmOffset -= 4;
+ if (FixupKind == MCFixupKind(X86::reloc_pcrel_1byte))
+ ImmOffset -= 1;
+
+ if (ImmOffset)
+ Expr = MCBinaryExpr::CreateAdd(Expr, MCConstantExpr::Create(ImmOffset, Ctx),
+ Ctx);
+
+ // Emit a symbolic constant as a fixup and 4 zeros.
+ Fixups.push_back(MCFixup::Create(CurByte, Expr, FixupKind));
+ EmitConstant(0, Size, CurByte, OS);
+}
+
+
+void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op,
+ unsigned RegOpcodeField,
+ unsigned TSFlags, unsigned &CurByte,
+ raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups) const{
+ const MCOperand &Disp = MI.getOperand(Op+3);
+ const MCOperand &Base = MI.getOperand(Op);
+ const MCOperand &Scale = MI.getOperand(Op+1);
+ const MCOperand &IndexReg = MI.getOperand(Op+2);
+ unsigned BaseReg = Base.getReg();
+
+ // Handle %rip relative addressing.
+ if (BaseReg == X86::RIP) { // [disp32+RIP] in X86-64 mode
+ assert(IndexReg.getReg() == 0 && Is64BitMode &&
+ "Invalid rip-relative address");
+ EmitByte(ModRMByte(0, RegOpcodeField, 5), CurByte, OS);
+
+ unsigned FixupKind = X86::reloc_riprel_4byte;
+
+ // movq loads are handled with a special relocation form which allows the
+ // linker to eliminate some loads for GOT references which end up in the
+ // same linkage unit.
+ if (MI.getOpcode() == X86::MOV64rm ||
+ MI.getOpcode() == X86::MOV64rm_TC)
+ FixupKind = X86::reloc_riprel_4byte_movq_load;
+
+ // rip-relative addressing is actually relative to the *next* instruction.
+ // Since an immediate can follow the mod/rm byte for an instruction, this
+ // means that we need to bias the immediate field of the instruction with
+ // the size of the immediate field. If we have this case, add it into the
+ // expression to emit.
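+    // For example, in 'cmpl $imm32, foo(%rip)' a 4-byte immediate follows the
+    // disp32 field, so the rip-relative fixup is biased by an extra -4 here.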
+ int ImmSize = X86II::hasImm(TSFlags) ? X86II::getSizeOfImm(TSFlags) : 0;
+
+ EmitImmediate(Disp, 4, MCFixupKind(FixupKind),
+ CurByte, OS, Fixups, -ImmSize);
+ return;
+ }
+
+ unsigned BaseRegNo = BaseReg ? GetX86RegNum(Base) : -1U;
+
+ // Determine whether a SIB byte is needed.
+ // If no BaseReg, issue a RIP relative instruction only if the MCE can
+ // resolve addresses on-the-fly, otherwise use SIB (Intel Manual 2A, table
+ // 2-7) and absolute references.
+
+ if (// The SIB byte must be used if there is an index register.
+ IndexReg.getReg() == 0 &&
+ // The SIB byte must be used if the base is ESP/RSP/R12, all of which
+ // encode to an R/M value of 4, which indicates that a SIB byte is
+ // present.
+ BaseRegNo != N86::ESP &&
+ // If there is no base register and we're in 64-bit mode, we need a SIB
+ // byte to emit an addr that is just 'disp32' (the non-RIP relative form).
+ (!Is64BitMode || BaseReg != 0)) {
+
+ if (BaseReg == 0) { // [disp32] in X86-32 mode
+ EmitByte(ModRMByte(0, RegOpcodeField, 5), CurByte, OS);
+ EmitImmediate(Disp, 4, FK_Data_4, CurByte, OS, Fixups);
+ return;
+ }
+
+ // If the base is not EBP/ESP and there is no displacement, use simple
+ // indirect register encoding, this handles addresses like [EAX]. The
+ // encoding for [EBP] with no displacement means [disp32] so we handle it
+ // by emitting a displacement of 0 below.
+ if (Disp.isImm() && Disp.getImm() == 0 && BaseRegNo != N86::EBP) {
+ EmitByte(ModRMByte(0, RegOpcodeField, BaseRegNo), CurByte, OS);
+ return;
+ }
+
+ // Otherwise, if the displacement fits in a byte, encode as [REG+disp8].
+ if (Disp.isImm() && isDisp8(Disp.getImm())) {
+ EmitByte(ModRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS);
+ EmitImmediate(Disp, 1, FK_Data_1, CurByte, OS, Fixups);
+ return;
+ }
+
+ // Otherwise, emit the most general non-SIB encoding: [REG+disp32]
+ EmitByte(ModRMByte(2, RegOpcodeField, BaseRegNo), CurByte, OS);
+ EmitImmediate(Disp, 4, FK_Data_4, CurByte, OS, Fixups);
+ return;
+ }
+
+ // We need a SIB byte, so start by outputting the ModR/M byte first
+ assert(IndexReg.getReg() != X86::ESP &&
+ IndexReg.getReg() != X86::RSP && "Cannot use ESP as index reg!");
+
+ bool ForceDisp32 = false;
+ bool ForceDisp8 = false;
+ if (BaseReg == 0) {
+ // If there is no base register, we emit the special case SIB byte with
+ // MOD=0, BASE=5, to JUST get the index, scale, and displacement.
+ EmitByte(ModRMByte(0, RegOpcodeField, 4), CurByte, OS);
+ ForceDisp32 = true;
+ } else if (!Disp.isImm()) {
+ // Emit the normal disp32 encoding.
+ EmitByte(ModRMByte(2, RegOpcodeField, 4), CurByte, OS);
+ ForceDisp32 = true;
+ } else if (Disp.getImm() == 0 &&
+             // The base register can't be one that encodes to '5' in the base
+             // field; that is the magic [*] encoding that indicates no base.
+ BaseRegNo != N86::EBP) {
+ // Emit no displacement ModR/M byte
+ EmitByte(ModRMByte(0, RegOpcodeField, 4), CurByte, OS);
+ } else if (isDisp8(Disp.getImm())) {
+ // Emit the disp8 encoding.
+ EmitByte(ModRMByte(1, RegOpcodeField, 4), CurByte, OS);
+ ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP
+ } else {
+ // Emit the normal disp32 encoding.
+ EmitByte(ModRMByte(2, RegOpcodeField, 4), CurByte, OS);
+ }
+
+ // Calculate what the SS field value should be...
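+  // SSTable maps a scale of 1/2/4/8 to the 2-bit SS value 0/1/2/3; other
+  // indices are invalid (~0).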
+ static const unsigned SSTable[] = { ~0, 0, 1, ~0, 2, ~0, ~0, ~0, 3 };
+ unsigned SS = SSTable[Scale.getImm()];
+
+ if (BaseReg == 0) {
+ // Handle the SIB byte for the case where there is no base, see Intel
+ // Manual 2A, table 2-7. The displacement has already been output.
+ unsigned IndexRegNo;
+ if (IndexReg.getReg())
+ IndexRegNo = GetX86RegNum(IndexReg);
+ else // Examples: [ESP+1*<noreg>+4] or [scaled idx]+disp32 (MOD=0,BASE=5)
+ IndexRegNo = 4;
+ EmitSIBByte(SS, IndexRegNo, 5, CurByte, OS);
+ } else {
+ unsigned IndexRegNo;
+ if (IndexReg.getReg())
+ IndexRegNo = GetX86RegNum(IndexReg);
+ else
+ IndexRegNo = 4; // For example [ESP+1*<noreg>+4]
+ EmitSIBByte(SS, IndexRegNo, GetX86RegNum(Base), CurByte, OS);
+ }
+
+ // Do we need to output a displacement?
+ if (ForceDisp8)
+ EmitImmediate(Disp, 1, FK_Data_1, CurByte, OS, Fixups);
+ else if (ForceDisp32 || Disp.getImm() != 0)
+ EmitImmediate(Disp, 4, FK_Data_4, CurByte, OS, Fixups);
+}
+
+/// DetermineREXPrefix - Determine if the MCInst has to be encoded with an X86-64
+/// REX prefix which specifies 1) 64-bit instructions, 2) non-default operand
+/// size, and 3) use of X86-64 extended registers.
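+/// For example, 'incq %r8' must be encoded as 49 FF C0, where the REX byte
+/// 0x49 carries REX.W and REX.B.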
+static unsigned DetermineREXPrefix(const MCInst &MI, unsigned TSFlags,
+ const TargetInstrDesc &Desc) {
+ // Pseudo instructions never have a rex byte.
+ if ((TSFlags & X86II::FormMask) == X86II::Pseudo)
+ return 0;
+
+ unsigned REX = 0;
+ if (TSFlags & X86II::REX_W)
+ REX |= 1 << 3;
+
+ if (MI.getNumOperands() == 0) return REX;
+
+ unsigned NumOps = MI.getNumOperands();
+  // FIXME: MCInst should make the two-address form explicit.
+ bool isTwoAddr = NumOps > 1 &&
+ Desc.getOperandConstraint(1, TOI::TIED_TO) != -1;
+
+ // If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix.
+ unsigned i = isTwoAddr ? 1 : 0;
+ for (; i != NumOps; ++i) {
+ const MCOperand &MO = MI.getOperand(i);
+ if (!MO.isReg()) continue;
+ unsigned Reg = MO.getReg();
+ if (!X86InstrInfo::isX86_64NonExtLowByteReg(Reg)) continue;
+ // FIXME: The caller of DetermineREXPrefix slaps this prefix onto anything
+ // that returns non-zero.
+ REX |= 0x40;
+ break;
+ }
+
+ switch (TSFlags & X86II::FormMask) {
+ case X86II::MRMInitReg: assert(0 && "FIXME: Remove this!");
+ case X86II::MRMSrcReg:
+ if (MI.getOperand(0).isReg() &&
+ X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
+ REX |= 1 << 2;
+ i = isTwoAddr ? 2 : 1;
+ for (; i != NumOps; ++i) {
+ const MCOperand &MO = MI.getOperand(i);
+ if (MO.isReg() && X86InstrInfo::isX86_64ExtendedReg(MO.getReg()))
+ REX |= 1 << 0;
+ }
+ break;
+ case X86II::MRMSrcMem: {
+ if (MI.getOperand(0).isReg() &&
+ X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
+ REX |= 1 << 2;
+ unsigned Bit = 0;
+ i = isTwoAddr ? 2 : 1;
+ for (; i != NumOps; ++i) {
+ const MCOperand &MO = MI.getOperand(i);
+ if (MO.isReg()) {
+ if (X86InstrInfo::isX86_64ExtendedReg(MO.getReg()))
+ REX |= 1 << Bit;
+ Bit++;
+ }
+ }
+ break;
+ }
+ case X86II::MRM0m: case X86II::MRM1m:
+ case X86II::MRM2m: case X86II::MRM3m:
+ case X86II::MRM4m: case X86II::MRM5m:
+ case X86II::MRM6m: case X86II::MRM7m:
+ case X86II::MRMDestMem: {
+ unsigned e = (isTwoAddr ? X86AddrNumOperands+1 : X86AddrNumOperands);
+ i = isTwoAddr ? 1 : 0;
+ if (NumOps > e && MI.getOperand(e).isReg() &&
+ X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(e).getReg()))
+ REX |= 1 << 2;
+ unsigned Bit = 0;
+ for (; i != e; ++i) {
+ const MCOperand &MO = MI.getOperand(i);
+ if (MO.isReg()) {
+ if (X86InstrInfo::isX86_64ExtendedReg(MO.getReg()))
+ REX |= 1 << Bit;
+ Bit++;
+ }
+ }
+ break;
+ }
+ default:
+ if (MI.getOperand(0).isReg() &&
+ X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
+ REX |= 1 << 0;
+ i = isTwoAddr ? 2 : 1;
+ for (unsigned e = NumOps; i != e; ++i) {
+ const MCOperand &MO = MI.getOperand(i);
+ if (MO.isReg() && X86InstrInfo::isX86_64ExtendedReg(MO.getReg()))
+ REX |= 1 << 2;
+ }
+ break;
+ }
+ return REX;
+}
+
+void X86MCCodeEmitter::
+EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups) const {
+ unsigned Opcode = MI.getOpcode();
+ const TargetInstrDesc &Desc = TII.get(Opcode);
+ unsigned TSFlags = Desc.TSFlags;
+
+ // Keep track of the current byte being emitted.
+ unsigned CurByte = 0;
+
+ // FIXME: We should emit the prefixes in exactly the same order as GAS does,
+ // in order to provide diffability.
+
+ // Emit the lock opcode prefix as needed.
+ if (TSFlags & X86II::LOCK)
+ EmitByte(0xF0, CurByte, OS);
+
+ // Emit segment override opcode prefix as needed.
+ switch (TSFlags & X86II::SegOvrMask) {
+ default: assert(0 && "Invalid segment!");
+ case 0: break; // No segment override!
+ case X86II::FS:
+ EmitByte(0x64, CurByte, OS);
+ break;
+ case X86II::GS:
+ EmitByte(0x65, CurByte, OS);
+ break;
+ }
+
+ // Emit the repeat opcode prefix as needed.
+ if ((TSFlags & X86II::Op0Mask) == X86II::REP)
+ EmitByte(0xF3, CurByte, OS);
+
+ // Emit the operand size opcode prefix as needed.
+ if (TSFlags & X86II::OpSize)
+ EmitByte(0x66, CurByte, OS);
+
+ // Emit the address size opcode prefix as needed.
+ if (TSFlags & X86II::AdSize)
+ EmitByte(0x67, CurByte, OS);
+
+ bool Need0FPrefix = false;
+ switch (TSFlags & X86II::Op0Mask) {
+ default: assert(0 && "Invalid prefix!");
+ case 0: break; // No prefix!
+ case X86II::REP: break; // already handled.
+ case X86II::TB: // Two-byte opcode prefix
+ case X86II::T8: // 0F 38
+ case X86II::TA: // 0F 3A
+ Need0FPrefix = true;
+ break;
+ case X86II::TF: // F2 0F 38
+ EmitByte(0xF2, CurByte, OS);
+ Need0FPrefix = true;
+ break;
+ case X86II::XS: // F3 0F
+ EmitByte(0xF3, CurByte, OS);
+ Need0FPrefix = true;
+ break;
+ case X86II::XD: // F2 0F
+ EmitByte(0xF2, CurByte, OS);
+ Need0FPrefix = true;
+ break;
+ case X86II::D8: EmitByte(0xD8, CurByte, OS); break;
+ case X86II::D9: EmitByte(0xD9, CurByte, OS); break;
+ case X86II::DA: EmitByte(0xDA, CurByte, OS); break;
+ case X86II::DB: EmitByte(0xDB, CurByte, OS); break;
+ case X86II::DC: EmitByte(0xDC, CurByte, OS); break;
+ case X86II::DD: EmitByte(0xDD, CurByte, OS); break;
+ case X86II::DE: EmitByte(0xDE, CurByte, OS); break;
+ case X86II::DF: EmitByte(0xDF, CurByte, OS); break;
+ }
+
+ // Handle REX prefix.
+ // FIXME: Can this come before F2 etc to simplify emission?
+ if (Is64BitMode) {
+ if (unsigned REX = DetermineREXPrefix(MI, TSFlags, Desc))
+ EmitByte(0x40 | REX, CurByte, OS);
+ }
+
+ // 0x0F escape code must be emitted just before the opcode.
+ if (Need0FPrefix)
+ EmitByte(0x0F, CurByte, OS);
+
+ // FIXME: Pull this up into previous switch if REX can be moved earlier.
+ switch (TSFlags & X86II::Op0Mask) {
+ case X86II::TF: // F2 0F 38
+ case X86II::T8: // 0F 38
+ EmitByte(0x38, CurByte, OS);
+ break;
+ case X86II::TA: // 0F 3A
+ EmitByte(0x3A, CurByte, OS);
+ break;
+ }
+
+ // If this is a two-address instruction, skip one of the register operands.
+ unsigned NumOps = Desc.getNumOperands();
+ unsigned CurOp = 0;
+ if (NumOps > 1 && Desc.getOperandConstraint(1, TOI::TIED_TO) != -1)
+ ++CurOp;
+ else if (NumOps > 2 && Desc.getOperandConstraint(NumOps-1, TOI::TIED_TO)== 0)
+ // Skip the last source operand that is tied_to the dest reg. e.g. LXADD32
+ --NumOps;
+
+ unsigned char BaseOpcode = X86II::getBaseOpcodeFor(TSFlags);
+ switch (TSFlags & X86II::FormMask) {
+ case X86II::MRMInitReg:
+ assert(0 && "FIXME: Remove this form when the JIT moves to MCCodeEmitter!");
+ default: errs() << "FORM: " << (TSFlags & X86II::FormMask) << "\n";
+ assert(0 && "Unknown FormMask value in X86MCCodeEmitter!");
+ case X86II::Pseudo: return; // Pseudo instructions encode to nothing.
+ case X86II::RawFrm:
+ EmitByte(BaseOpcode, CurByte, OS);
+ break;
+
+ case X86II::AddRegFrm:
+ EmitByte(BaseOpcode + GetX86RegNum(MI.getOperand(CurOp++)), CurByte, OS);
+ break;
+
+ case X86II::MRMDestReg:
+ EmitByte(BaseOpcode, CurByte, OS);
+ EmitRegModRMByte(MI.getOperand(CurOp),
+ GetX86RegNum(MI.getOperand(CurOp+1)), CurByte, OS);
+ CurOp += 2;
+ break;
+
+ case X86II::MRMDestMem:
+ EmitByte(BaseOpcode, CurByte, OS);
+ EmitMemModRMByte(MI, CurOp,
+ GetX86RegNum(MI.getOperand(CurOp + X86AddrNumOperands)),
+ TSFlags, CurByte, OS, Fixups);
+ CurOp += X86AddrNumOperands + 1;
+ break;
+
+ case X86II::MRMSrcReg:
+ EmitByte(BaseOpcode, CurByte, OS);
+ EmitRegModRMByte(MI.getOperand(CurOp+1), GetX86RegNum(MI.getOperand(CurOp)),
+ CurByte, OS);
+ CurOp += 2;
+ break;
+
+ case X86II::MRMSrcMem: {
+ EmitByte(BaseOpcode, CurByte, OS);
+
+ // FIXME: Maybe lea should have its own form? This is a horrible hack.
+ int AddrOperands;
+ if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r ||
+ Opcode == X86::LEA16r || Opcode == X86::LEA32r)
+ AddrOperands = X86AddrNumOperands - 1; // No segment register
+ else
+ AddrOperands = X86AddrNumOperands;
+
+ EmitMemModRMByte(MI, CurOp+1, GetX86RegNum(MI.getOperand(CurOp)),
+ TSFlags, CurByte, OS, Fixups);
+ CurOp += AddrOperands + 1;
+ break;
+ }
+
+ case X86II::MRM0r: case X86II::MRM1r:
+ case X86II::MRM2r: case X86II::MRM3r:
+ case X86II::MRM4r: case X86II::MRM5r:
+ case X86II::MRM6r: case X86II::MRM7r:
+ EmitByte(BaseOpcode, CurByte, OS);
+ EmitRegModRMByte(MI.getOperand(CurOp++),
+ (TSFlags & X86II::FormMask)-X86II::MRM0r,
+ CurByte, OS);
+ break;
+ case X86II::MRM0m: case X86II::MRM1m:
+ case X86II::MRM2m: case X86II::MRM3m:
+ case X86II::MRM4m: case X86II::MRM5m:
+ case X86II::MRM6m: case X86II::MRM7m:
+ EmitByte(BaseOpcode, CurByte, OS);
+ EmitMemModRMByte(MI, CurOp, (TSFlags & X86II::FormMask)-X86II::MRM0m,
+ TSFlags, CurByte, OS, Fixups);
+ CurOp += X86AddrNumOperands;
+ break;
+ case X86II::MRM_C1:
+ EmitByte(BaseOpcode, CurByte, OS);
+ EmitByte(0xC1, CurByte, OS);
+ break;
+ case X86II::MRM_C2:
+ EmitByte(BaseOpcode, CurByte, OS);
+ EmitByte(0xC2, CurByte, OS);
+ break;
+ case X86II::MRM_C3:
+ EmitByte(BaseOpcode, CurByte, OS);
+ EmitByte(0xC3, CurByte, OS);
+ break;
+ case X86II::MRM_C4:
+ EmitByte(BaseOpcode, CurByte, OS);
+ EmitByte(0xC4, CurByte, OS);
+ break;
+ case X86II::MRM_C8:
+ EmitByte(BaseOpcode, CurByte, OS);
+ EmitByte(0xC8, CurByte, OS);
+ break;
+ case X86II::MRM_C9:
+ EmitByte(BaseOpcode, CurByte, OS);
+ EmitByte(0xC9, CurByte, OS);
+ break;
+ case X86II::MRM_E8:
+ EmitByte(BaseOpcode, CurByte, OS);
+ EmitByte(0xE8, CurByte, OS);
+ break;
+ case X86II::MRM_F0:
+ EmitByte(BaseOpcode, CurByte, OS);
+ EmitByte(0xF0, CurByte, OS);
+ break;
+ case X86II::MRM_F8:
+ EmitByte(BaseOpcode, CurByte, OS);
+ EmitByte(0xF8, CurByte, OS);
+ break;
+ case X86II::MRM_F9:
+ EmitByte(BaseOpcode, CurByte, OS);
+ EmitByte(0xF9, CurByte, OS);
+ break;
+ }
+
+ // If there is a remaining operand, it must be a trailing immediate. Emit it
+ // according to the right size for the instruction.
+ if (CurOp != NumOps)
+ EmitImmediate(MI.getOperand(CurOp++),
+ X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags),
+ CurByte, OS, Fixups);
+
+#ifndef NDEBUG
+ // FIXME: Verify.
+ if (/*!Desc.isVariadic() &&*/ CurOp != NumOps) {
+ errs() << "Cannot encode all operands of: ";
+ MI.dump();
+ errs() << '\n';
+ abort();
+ }
+#endif
+}
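
For reference, the MRM0r-MRM7r cases above place a /digit opcode extension in the
"reg" field of the ModRM byte, with mod = 0b11 for a register-direct operand. A
minimal standalone sketch (the helper name and values are illustrative, not LLVM
code):

  #include <cstdint>
  #include <cstdio>

  // ModRM layout: [mod:2][reg/opcode:3][r/m:3].
  static uint8_t modRMByte(unsigned Mod, unsigned RegOpcode, unsigned RM) {
    return uint8_t((Mod << 6) | ((RegOpcode & 7) << 3) | (RM & 7));
  }

  int main() {
    // e.g. a /4 opcode extension applied to register number 0 (EAX).
    std::printf("ModRM = 0x%02X\n", modRMByte(3, 4, 0)); // prints 0xE0
    return 0;
  }
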
diff --git a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h
new file mode 100644
index 0000000..06043ec
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h
@@ -0,0 +1,135 @@
+//====- X86MachineFunctionInfo.h - X86 machine function info ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares X86-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86MACHINEFUNCTIONINFO_H
+#define X86MACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+/// X86MachineFunctionInfo - This class is derived from MachineFunction and
+/// contains private X86 target-specific information for each MachineFunction.
+class X86MachineFunctionInfo : public MachineFunctionInfo {
+  /// ForceFramePointer - True if the function is required to use a frame
+  /// pointer for reasons other than containing dynamic allocation or having
+  /// FP elimination turned off. For example, Cygwin's main function contains
+  /// stack pointer re-alignment code, which requires a frame pointer.
+ bool ForceFramePointer;
+
+ /// CalleeSavedFrameSize - Size of the callee-saved register portion of the
+ /// stack frame in bytes.
+ unsigned CalleeSavedFrameSize;
+
+  /// BytesToPopOnReturn - Number of bytes function pops on return (in addition
+  /// to the space used by the return address).
+  /// Used on Windows platforms for stdcall & fastcall name decoration.
+ unsigned BytesToPopOnReturn;
+
+ /// ReturnAddrIndex - FrameIndex for return slot.
+ int ReturnAddrIndex;
+
+ /// TailCallReturnAddrDelta - The number of bytes by which return address
+ /// stack slot is moved as the result of tail call optimization.
+ int TailCallReturnAddrDelta;
+
+ /// SRetReturnReg - Some subtargets require that sret lowering includes
+ /// returning the value of the returned struct in a register. This field
+ /// holds the virtual register into which the sret argument is passed.
+ unsigned SRetReturnReg;
+
+  /// GlobalBaseReg - keeps track of the virtual register initialized for
+  /// use as the global base register. This is used for PIC in some
+  /// relocation models.
+ unsigned GlobalBaseReg;
+
+  /// ReserveFP - Whether the function should reserve the frame pointer
+  /// register during allocation, even if no frame pointer ends up being used.
+ bool ReserveFP;
+
+ /// VarArgsFrameIndex - FrameIndex for start of varargs area.
+ int VarArgsFrameIndex;
+ /// RegSaveFrameIndex - X86-64 vararg func register save area.
+ int RegSaveFrameIndex;
+ /// VarArgsGPOffset - X86-64 vararg func int reg offset.
+ unsigned VarArgsGPOffset;
+ /// VarArgsFPOffset - X86-64 vararg func fp reg offset.
+ unsigned VarArgsFPOffset;
+
+public:
+  X86MachineFunctionInfo() : ForceFramePointer(false),
+                             CalleeSavedFrameSize(0),
+                             BytesToPopOnReturn(0),
+                             ReturnAddrIndex(0),
+                             TailCallReturnAddrDelta(0),
+                             SRetReturnReg(0),
+                             GlobalBaseReg(0),
+                             ReserveFP(false),
+                             VarArgsFrameIndex(0),
+                             RegSaveFrameIndex(0),
+                             VarArgsGPOffset(0),
+                             VarArgsFPOffset(0) {}
+
+ explicit X86MachineFunctionInfo(MachineFunction &MF)
+ : ForceFramePointer(false),
+ CalleeSavedFrameSize(0),
+ BytesToPopOnReturn(0),
+ ReturnAddrIndex(0),
+ TailCallReturnAddrDelta(0),
+ SRetReturnReg(0),
+ GlobalBaseReg(0),
+ ReserveFP(false),
+ VarArgsFrameIndex(0),
+ RegSaveFrameIndex(0),
+ VarArgsGPOffset(0),
+ VarArgsFPOffset(0) {}
+
+ bool getForceFramePointer() const { return ForceFramePointer;}
+ void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; }
+
+ unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; }
+ void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; }
+
+ unsigned getBytesToPopOnReturn() const { return BytesToPopOnReturn; }
+ void setBytesToPopOnReturn (unsigned bytes) { BytesToPopOnReturn = bytes;}
+
+ int getRAIndex() const { return ReturnAddrIndex; }
+ void setRAIndex(int Index) { ReturnAddrIndex = Index; }
+
+ int getTCReturnAddrDelta() const { return TailCallReturnAddrDelta; }
+ void setTCReturnAddrDelta(int delta) {TailCallReturnAddrDelta = delta;}
+
+ unsigned getSRetReturnReg() const { return SRetReturnReg; }
+ void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
+
+ unsigned getGlobalBaseReg() const { return GlobalBaseReg; }
+ void setGlobalBaseReg(unsigned Reg) { GlobalBaseReg = Reg; }
+
+ bool getReserveFP() const { return ReserveFP; }
+ void setReserveFP(bool reserveFP) { ReserveFP = reserveFP; }
+
+ int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
+ void setVarArgsFrameIndex(int Idx) { VarArgsFrameIndex = Idx; }
+
+ int getRegSaveFrameIndex() const { return RegSaveFrameIndex; }
+ void setRegSaveFrameIndex(int Idx) { RegSaveFrameIndex = Idx; }
+
+ unsigned getVarArgsGPOffset() const { return VarArgsGPOffset; }
+ void setVarArgsGPOffset(unsigned Offset) { VarArgsGPOffset = Offset; }
+
+ unsigned getVarArgsFPOffset() const { return VarArgsFPOffset; }
+ void setVarArgsFPOffset(unsigned Offset) { VarArgsFPOffset = Offset; }
+};
+
+} // End llvm namespace
+
+#endif
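
The accessors above are how the later code in this patch (emitPrologue, frame
index lowering) reads and updates per-function state. A simplified, self-contained
sketch of one such interaction, folding a negative tail-call return-address delta
into the callee-saved frame size; the struct below is only a stand-in, not the
LLVM class:

  #include <cstdio>

  struct FuncInfoSketch {               // stand-in for X86MachineFunctionInfo
    unsigned CalleeSavedFrameSize = 0;
    int TailCallReturnAddrDelta = 0;
  };

  int main() {
    FuncInfoSketch FI;
    FI.CalleeSavedFrameSize    = 16;    // e.g. two 8-byte callee-saved slots
    FI.TailCallReturnAddrDelta = -8;    // callee needs 8 extra argument bytes
    if (FI.TailCallReturnAddrDelta < 0)
      FI.CalleeSavedFrameSize -= FI.TailCallReturnAddrDelta;  // 16 + 8 = 24
    std::printf("callee-saved frame size = %u\n", FI.CalleeSavedFrameSize);
    return 0;
  }
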
diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp
new file mode 100644
index 0000000..98975ea
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -0,0 +1,1573 @@
+//===- X86RegisterInfo.cpp - X86 Register Information -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetRegisterInfo class.
+// This file is responsible for the frame pointer elimination optimization
+// on X86.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86RegisterInfo.h"
+#include "X86InstrBuilder.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/Type.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/ErrorHandling.h"
+using namespace llvm;
+
+X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm,
+ const TargetInstrInfo &tii)
+ : X86GenRegisterInfo(tm.getSubtarget<X86Subtarget>().is64Bit() ?
+ X86::ADJCALLSTACKDOWN64 :
+ X86::ADJCALLSTACKDOWN32,
+ tm.getSubtarget<X86Subtarget>().is64Bit() ?
+ X86::ADJCALLSTACKUP64 :
+ X86::ADJCALLSTACKUP32),
+ TM(tm), TII(tii) {
+ // Cache some information.
+ const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
+ Is64Bit = Subtarget->is64Bit();
+ IsWin64 = Subtarget->isTargetWin64();
+ StackAlign = TM.getFrameInfo()->getStackAlignment();
+
+ if (Is64Bit) {
+ SlotSize = 8;
+ StackPtr = X86::RSP;
+ FramePtr = X86::RBP;
+ } else {
+ SlotSize = 4;
+ StackPtr = X86::ESP;
+ FramePtr = X86::EBP;
+ }
+}
+
+/// getDwarfRegNum - This function maps LLVM register identifiers to the DWARF
+/// specific numbering, used in debug info and exception tables.
+int X86RegisterInfo::getDwarfRegNum(unsigned RegNo, bool isEH) const {
+ const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
+ unsigned Flavour = DWARFFlavour::X86_64;
+
+ if (!Subtarget->is64Bit()) {
+ if (Subtarget->isTargetDarwin()) {
+ if (isEH)
+ Flavour = DWARFFlavour::X86_32_DarwinEH;
+ else
+ Flavour = DWARFFlavour::X86_32_Generic;
+ } else if (Subtarget->isTargetCygMing()) {
+      // Unsupported for now; just fall back to the generic flavour.
+ Flavour = DWARFFlavour::X86_32_Generic;
+ } else {
+ Flavour = DWARFFlavour::X86_32_Generic;
+ }
+ }
+
+ return X86GenRegisterInfo::getDwarfRegNumFull(RegNo, Flavour);
+}
+
+/// getX86RegNum - This function maps LLVM register identifiers to their X86
+/// specific numbering, which is used in various places encoding instructions.
+unsigned X86RegisterInfo::getX86RegNum(unsigned RegNo) {
+ switch(RegNo) {
+ case X86::RAX: case X86::EAX: case X86::AX: case X86::AL: return N86::EAX;
+ case X86::RCX: case X86::ECX: case X86::CX: case X86::CL: return N86::ECX;
+ case X86::RDX: case X86::EDX: case X86::DX: case X86::DL: return N86::EDX;
+ case X86::RBX: case X86::EBX: case X86::BX: case X86::BL: return N86::EBX;
+ case X86::RSP: case X86::ESP: case X86::SP: case X86::SPL: case X86::AH:
+ return N86::ESP;
+ case X86::RBP: case X86::EBP: case X86::BP: case X86::BPL: case X86::CH:
+ return N86::EBP;
+ case X86::RSI: case X86::ESI: case X86::SI: case X86::SIL: case X86::DH:
+ return N86::ESI;
+ case X86::RDI: case X86::EDI: case X86::DI: case X86::DIL: case X86::BH:
+ return N86::EDI;
+
+ case X86::R8: case X86::R8D: case X86::R8W: case X86::R8B:
+ return N86::EAX;
+ case X86::R9: case X86::R9D: case X86::R9W: case X86::R9B:
+ return N86::ECX;
+ case X86::R10: case X86::R10D: case X86::R10W: case X86::R10B:
+ return N86::EDX;
+ case X86::R11: case X86::R11D: case X86::R11W: case X86::R11B:
+ return N86::EBX;
+ case X86::R12: case X86::R12D: case X86::R12W: case X86::R12B:
+ return N86::ESP;
+ case X86::R13: case X86::R13D: case X86::R13W: case X86::R13B:
+ return N86::EBP;
+ case X86::R14: case X86::R14D: case X86::R14W: case X86::R14B:
+ return N86::ESI;
+ case X86::R15: case X86::R15D: case X86::R15W: case X86::R15B:
+ return N86::EDI;
+
+ case X86::ST0: case X86::ST1: case X86::ST2: case X86::ST3:
+ case X86::ST4: case X86::ST5: case X86::ST6: case X86::ST7:
+ return RegNo-X86::ST0;
+
+ case X86::XMM0: case X86::XMM8: case X86::MM0:
+ return 0;
+ case X86::XMM1: case X86::XMM9: case X86::MM1:
+ return 1;
+ case X86::XMM2: case X86::XMM10: case X86::MM2:
+ return 2;
+ case X86::XMM3: case X86::XMM11: case X86::MM3:
+ return 3;
+ case X86::XMM4: case X86::XMM12: case X86::MM4:
+ return 4;
+ case X86::XMM5: case X86::XMM13: case X86::MM5:
+ return 5;
+ case X86::XMM6: case X86::XMM14: case X86::MM6:
+ return 6;
+ case X86::XMM7: case X86::XMM15: case X86::MM7:
+ return 7;
+
+ case X86::ES:
+ return 0;
+ case X86::CS:
+ return 1;
+ case X86::SS:
+ return 2;
+ case X86::DS:
+ return 3;
+ case X86::FS:
+ return 4;
+ case X86::GS:
+ return 5;
+
+ default:
+ assert(isVirtualRegister(RegNo) && "Unknown physical register!");
+ llvm_unreachable("Register allocator hasn't allocated reg correctly yet!");
+ return 0;
+ }
+}
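
Note that getX86RegNum hands back only the low three encoding bits; for R8-R15 the
fourth bit travels separately in the REX prefix (REX.B/REX.R/REX.X), which is why
R8 maps to the same slot as EAX above. A standalone sketch with an illustrative
hardware register number:

  #include <cstdio>

  int main() {
    unsigned HwNum  = 10;                // e.g. R10 in hardware numbering
    unsigned Low3   = HwNum & 7;         // what the ModRM field carries (2)
    unsigned RexBit = (HwNum >> 3) & 1;  // what the REX prefix carries (1)
    std::printf("low3=%u rex=%u\n", Low3, RexBit);
    return 0;
  }
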
+
+const TargetRegisterClass *
+X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A,
+ const TargetRegisterClass *B,
+ unsigned SubIdx) const {
+ switch (SubIdx) {
+ default: return 0;
+ case X86::sub_8bit:
+ if (B == &X86::GR8RegClass) {
+ if (A->getSize() == 2 || A->getSize() == 4 || A->getSize() == 8)
+ return A;
+ } else if (B == &X86::GR8_ABCD_LRegClass || B == &X86::GR8_ABCD_HRegClass) {
+ if (A == &X86::GR64RegClass || A == &X86::GR64_ABCDRegClass ||
+ A == &X86::GR64_NOREXRegClass ||
+ A == &X86::GR64_NOSPRegClass ||
+ A == &X86::GR64_NOREX_NOSPRegClass)
+ return &X86::GR64_ABCDRegClass;
+ else if (A == &X86::GR32RegClass || A == &X86::GR32_ABCDRegClass ||
+ A == &X86::GR32_NOREXRegClass ||
+ A == &X86::GR32_NOSPRegClass)
+ return &X86::GR32_ABCDRegClass;
+ else if (A == &X86::GR16RegClass || A == &X86::GR16_ABCDRegClass ||
+ A == &X86::GR16_NOREXRegClass)
+ return &X86::GR16_ABCDRegClass;
+ } else if (B == &X86::GR8_NOREXRegClass) {
+ if (A == &X86::GR64RegClass || A == &X86::GR64_NOREXRegClass ||
+ A == &X86::GR64_NOSPRegClass || A == &X86::GR64_NOREX_NOSPRegClass)
+ return &X86::GR64_NOREXRegClass;
+ else if (A == &X86::GR64_ABCDRegClass)
+ return &X86::GR64_ABCDRegClass;
+ else if (A == &X86::GR32RegClass || A == &X86::GR32_NOREXRegClass ||
+ A == &X86::GR32_NOSPRegClass)
+ return &X86::GR32_NOREXRegClass;
+ else if (A == &X86::GR32_ABCDRegClass)
+ return &X86::GR32_ABCDRegClass;
+ else if (A == &X86::GR16RegClass || A == &X86::GR16_NOREXRegClass)
+ return &X86::GR16_NOREXRegClass;
+ else if (A == &X86::GR16_ABCDRegClass)
+ return &X86::GR16_ABCDRegClass;
+ }
+ break;
+ case X86::sub_8bit_hi:
+ if (B == &X86::GR8_ABCD_HRegClass) {
+ if (A == &X86::GR64RegClass || A == &X86::GR64_ABCDRegClass ||
+ A == &X86::GR64_NOREXRegClass ||
+ A == &X86::GR64_NOSPRegClass ||
+ A == &X86::GR64_NOREX_NOSPRegClass)
+ return &X86::GR64_ABCDRegClass;
+ else if (A == &X86::GR32RegClass || A == &X86::GR32_ABCDRegClass ||
+ A == &X86::GR32_NOREXRegClass || A == &X86::GR32_NOSPRegClass)
+ return &X86::GR32_ABCDRegClass;
+ else if (A == &X86::GR16RegClass || A == &X86::GR16_ABCDRegClass ||
+ A == &X86::GR16_NOREXRegClass)
+ return &X86::GR16_ABCDRegClass;
+ }
+ break;
+ case X86::sub_16bit:
+ if (B == &X86::GR16RegClass) {
+ if (A->getSize() == 4 || A->getSize() == 8)
+ return A;
+ } else if (B == &X86::GR16_ABCDRegClass) {
+ if (A == &X86::GR64RegClass || A == &X86::GR64_ABCDRegClass ||
+ A == &X86::GR64_NOREXRegClass ||
+ A == &X86::GR64_NOSPRegClass ||
+ A == &X86::GR64_NOREX_NOSPRegClass)
+ return &X86::GR64_ABCDRegClass;
+ else if (A == &X86::GR32RegClass || A == &X86::GR32_ABCDRegClass ||
+ A == &X86::GR32_NOREXRegClass || A == &X86::GR32_NOSPRegClass)
+ return &X86::GR32_ABCDRegClass;
+ } else if (B == &X86::GR16_NOREXRegClass) {
+ if (A == &X86::GR64RegClass || A == &X86::GR64_NOREXRegClass ||
+ A == &X86::GR64_NOSPRegClass || A == &X86::GR64_NOREX_NOSPRegClass)
+ return &X86::GR64_NOREXRegClass;
+ else if (A == &X86::GR64_ABCDRegClass)
+ return &X86::GR64_ABCDRegClass;
+ else if (A == &X86::GR32RegClass || A == &X86::GR32_NOREXRegClass ||
+ A == &X86::GR32_NOSPRegClass)
+ return &X86::GR32_NOREXRegClass;
+ else if (A == &X86::GR32_ABCDRegClass)
+        return &X86::GR32_ABCDRegClass;
+ }
+ break;
+ case X86::sub_32bit:
+ if (B == &X86::GR32RegClass || B == &X86::GR32_NOSPRegClass) {
+ if (A->getSize() == 8)
+ return A;
+ } else if (B == &X86::GR32_ABCDRegClass) {
+ if (A == &X86::GR64RegClass || A == &X86::GR64_ABCDRegClass ||
+ A == &X86::GR64_NOREXRegClass ||
+ A == &X86::GR64_NOSPRegClass ||
+ A == &X86::GR64_NOREX_NOSPRegClass)
+ return &X86::GR64_ABCDRegClass;
+ } else if (B == &X86::GR32_NOREXRegClass) {
+ if (A == &X86::GR64RegClass || A == &X86::GR64_NOREXRegClass ||
+ A == &X86::GR64_NOSPRegClass || A == &X86::GR64_NOREX_NOSPRegClass)
+ return &X86::GR64_NOREXRegClass;
+ else if (A == &X86::GR64_ABCDRegClass)
+ return &X86::GR64_ABCDRegClass;
+ }
+ break;
+ case X86::sub_ss:
+ if (B == &X86::FR32RegClass)
+ return A;
+ break;
+ case X86::sub_sd:
+ if (B == &X86::FR64RegClass)
+ return A;
+ break;
+ case X86::sub_xmm:
+ if (B == &X86::VR128RegClass)
+ return A;
+ break;
+ }
+ return 0;
+}
+
+const TargetRegisterClass *
+X86RegisterInfo::getPointerRegClass(unsigned Kind) const {
+ switch (Kind) {
+ default: llvm_unreachable("Unexpected Kind in getPointerRegClass!");
+ case 0: // Normal GPRs.
+ if (TM.getSubtarget<X86Subtarget>().is64Bit())
+ return &X86::GR64RegClass;
+ return &X86::GR32RegClass;
+  case 1: // Normal GPRs except the stack pointer (for encoding reasons).
+ if (TM.getSubtarget<X86Subtarget>().is64Bit())
+ return &X86::GR64_NOSPRegClass;
+ return &X86::GR32_NOSPRegClass;
+ }
+}
+
+const TargetRegisterClass *
+X86RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
+ if (RC == &X86::CCRRegClass) {
+ if (Is64Bit)
+ return &X86::GR64RegClass;
+ else
+ return &X86::GR32RegClass;
+ }
+ return NULL;
+}
+
+const unsigned *
+X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+ bool callsEHReturn = false;
+ bool ghcCall = false;
+
+ if (MF) {
+ callsEHReturn = MF->getMMI().callsEHReturn();
+ const Function *F = MF->getFunction();
+ ghcCall = (F ? F->getCallingConv() == CallingConv::GHC : false);
+ }
+
+ static const unsigned GhcCalleeSavedRegs[] = {
+ 0
+ };
+
+ static const unsigned CalleeSavedRegs32Bit[] = {
+ X86::ESI, X86::EDI, X86::EBX, X86::EBP, 0
+ };
+
+ static const unsigned CalleeSavedRegs32EHRet[] = {
+ X86::EAX, X86::EDX, X86::ESI, X86::EDI, X86::EBX, X86::EBP, 0
+ };
+
+ static const unsigned CalleeSavedRegs64Bit[] = {
+ X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0
+ };
+
+ static const unsigned CalleeSavedRegs64EHRet[] = {
+ X86::RAX, X86::RDX, X86::RBX, X86::R12,
+ X86::R13, X86::R14, X86::R15, X86::RBP, 0
+ };
+
+ static const unsigned CalleeSavedRegsWin64[] = {
+ X86::RBX, X86::RBP, X86::RDI, X86::RSI,
+ X86::R12, X86::R13, X86::R14, X86::R15,
+ X86::XMM6, X86::XMM7, X86::XMM8, X86::XMM9,
+ X86::XMM10, X86::XMM11, X86::XMM12, X86::XMM13,
+ X86::XMM14, X86::XMM15, 0
+ };
+
+ if (ghcCall) {
+ return GhcCalleeSavedRegs;
+ } else if (Is64Bit) {
+ if (IsWin64)
+ return CalleeSavedRegsWin64;
+ else
+ return (callsEHReturn ? CalleeSavedRegs64EHRet : CalleeSavedRegs64Bit);
+ } else {
+ return (callsEHReturn ? CalleeSavedRegs32EHRet : CalleeSavedRegs32Bit);
+ }
+}
+
+const TargetRegisterClass* const*
+X86RegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
+ bool callsEHReturn = false;
+ if (MF)
+ callsEHReturn = MF->getMMI().callsEHReturn();
+
+ static const TargetRegisterClass * const CalleeSavedRegClasses32Bit[] = {
+ &X86::GR32RegClass, &X86::GR32RegClass,
+ &X86::GR32RegClass, &X86::GR32RegClass, 0
+ };
+ static const TargetRegisterClass * const CalleeSavedRegClasses32EHRet[] = {
+ &X86::GR32RegClass, &X86::GR32RegClass,
+ &X86::GR32RegClass, &X86::GR32RegClass,
+ &X86::GR32RegClass, &X86::GR32RegClass, 0
+ };
+ static const TargetRegisterClass * const CalleeSavedRegClasses64Bit[] = {
+ &X86::GR64RegClass, &X86::GR64RegClass,
+ &X86::GR64RegClass, &X86::GR64RegClass,
+ &X86::GR64RegClass, &X86::GR64RegClass, 0
+ };
+ static const TargetRegisterClass * const CalleeSavedRegClasses64EHRet[] = {
+ &X86::GR64RegClass, &X86::GR64RegClass,
+ &X86::GR64RegClass, &X86::GR64RegClass,
+ &X86::GR64RegClass, &X86::GR64RegClass,
+ &X86::GR64RegClass, &X86::GR64RegClass, 0
+ };
+ static const TargetRegisterClass * const CalleeSavedRegClassesWin64[] = {
+ &X86::GR64RegClass, &X86::GR64RegClass,
+ &X86::GR64RegClass, &X86::GR64RegClass,
+ &X86::GR64RegClass, &X86::GR64RegClass,
+ &X86::GR64RegClass, &X86::GR64RegClass,
+ &X86::VR128RegClass, &X86::VR128RegClass,
+ &X86::VR128RegClass, &X86::VR128RegClass,
+ &X86::VR128RegClass, &X86::VR128RegClass,
+ &X86::VR128RegClass, &X86::VR128RegClass,
+ &X86::VR128RegClass, &X86::VR128RegClass, 0
+ };
+
+ if (Is64Bit) {
+ if (IsWin64)
+ return CalleeSavedRegClassesWin64;
+ else
+ return (callsEHReturn ?
+ CalleeSavedRegClasses64EHRet : CalleeSavedRegClasses64Bit);
+ } else {
+ return (callsEHReturn ?
+ CalleeSavedRegClasses32EHRet : CalleeSavedRegClasses32Bit);
+ }
+}
+
+BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+ BitVector Reserved(getNumRegs());
+ // Set the stack-pointer register and its aliases as reserved.
+ Reserved.set(X86::RSP);
+ Reserved.set(X86::ESP);
+ Reserved.set(X86::SP);
+ Reserved.set(X86::SPL);
+
+ // Set the instruction pointer register and its aliases as reserved.
+ Reserved.set(X86::RIP);
+ Reserved.set(X86::EIP);
+ Reserved.set(X86::IP);
+
+ // Set the frame-pointer register and its aliases as reserved if needed.
+ if (hasFP(MF)) {
+ Reserved.set(X86::RBP);
+ Reserved.set(X86::EBP);
+ Reserved.set(X86::BP);
+ Reserved.set(X86::BPL);
+ }
+
+ // Mark the x87 stack registers as reserved, since they don't behave normally
+ // with respect to liveness. We don't fully model the effects of x87 stack
+ // pushes and pops after stackification.
+ Reserved.set(X86::ST0);
+ Reserved.set(X86::ST1);
+ Reserved.set(X86::ST2);
+ Reserved.set(X86::ST3);
+ Reserved.set(X86::ST4);
+ Reserved.set(X86::ST5);
+ Reserved.set(X86::ST6);
+ Reserved.set(X86::ST7);
+ return Reserved;
+}
+
+//===----------------------------------------------------------------------===//
+// Stack Frame Processing methods
+//===----------------------------------------------------------------------===//
+
+/// hasFP - Return true if the specified function should have a dedicated frame
+/// pointer register. This is true if the function has variable sized allocas
+/// or if frame pointer elimination is disabled.
+bool X86RegisterInfo::hasFP(const MachineFunction &MF) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ const MachineModuleInfo &MMI = MF.getMMI();
+
+ return (DisableFramePointerElim(MF) ||
+ needsStackRealignment(MF) ||
+ MFI->hasVarSizedObjects() ||
+ MFI->isFrameAddressTaken() ||
+ MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() ||
+ MMI.callsUnwindInit());
+}
+
+bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ return (RealignStack &&
+ !MFI->hasVarSizedObjects());
+}
+
+bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ const Function *F = MF.getFunction();
+ bool requiresRealignment =
+ RealignStack && ((MFI->getMaxAlignment() > StackAlign) ||
+ F->hasFnAttr(Attribute::StackAlignment));
+
+ // FIXME: Currently we don't support stack realignment for functions with
+ // variable-sized allocas.
+  // FIXME: Temporarily disable the error - it seems to be too conservative.
+  if (0 && requiresRealignment && MFI->hasVarSizedObjects())
+    report_fatal_error(
+      "Stack realignment in presence of dynamic allocas is not supported");
+
+ return (requiresRealignment && !MFI->hasVarSizedObjects());
+}
+
+bool X86RegisterInfo::hasReservedCallFrame(MachineFunction &MF) const {
+ return !MF.getFrameInfo()->hasVarSizedObjects();
+}
+
+bool X86RegisterInfo::hasReservedSpillSlot(MachineFunction &MF, unsigned Reg,
+ int &FrameIdx) const {
+ if (Reg == FramePtr && hasFP(MF)) {
+ FrameIdx = MF.getFrameInfo()->getObjectIndexBegin();
+ return true;
+ }
+ return false;
+}
+
+int
+X86RegisterInfo::getFrameIndexOffset(const MachineFunction &MF, int FI) const {
+ const TargetFrameInfo &TFI = *MF.getTarget().getFrameInfo();
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ int Offset = MFI->getObjectOffset(FI) - TFI.getOffsetOfLocalArea();
+ uint64_t StackSize = MFI->getStackSize();
+
+ if (needsStackRealignment(MF)) {
+ if (FI < 0) {
+ // Skip the saved EBP.
+ Offset += SlotSize;
+ } else {
+ unsigned Align = MFI->getObjectAlignment(FI);
+ assert((-(Offset + StackSize)) % Align == 0);
+ Align = 0;
+ return Offset + StackSize;
+ }
+ // FIXME: Support tail calls
+ } else {
+ if (!hasFP(MF))
+ return Offset + StackSize;
+
+ // Skip the saved EBP.
+ Offset += SlotSize;
+
+ // Skip the RETADDR move area
+ const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
+ if (TailCallReturnAddrDelta < 0)
+ Offset -= TailCallReturnAddrDelta;
+ }
+
+ return Offset;
+}
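
A worked example of the frame-pointer path above, with hypothetical numbers: a
local at object offset -12, a 4-byte slot size, a zero local-area offset, and a
-8 tail-call return-address delta.

  #include <cstdio>

  int main() {
    int ObjectOffset = -12, OffsetOfLocalArea = 0, SlotSize = 4;
    int TailCallReturnAddrDelta = -8;
    int Offset = ObjectOffset - OffsetOfLocalArea;
    Offset += SlotSize;                  // skip the saved EBP
    if (TailCallReturnAddrDelta < 0)
      Offset -= TailCallReturnAddrDelta; // skip the RETADDR move area
    std::printf("FP-relative offset = %d\n", Offset); // prints 0
    return 0;
  }
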
+
+static unsigned getSUBriOpcode(unsigned is64Bit, int64_t Imm) {
+ if (is64Bit) {
+ if (isInt<8>(Imm))
+ return X86::SUB64ri8;
+ return X86::SUB64ri32;
+ } else {
+ if (isInt<8>(Imm))
+ return X86::SUB32ri8;
+ return X86::SUB32ri;
+ }
+}
+
+static unsigned getADDriOpcode(unsigned is64Bit, int64_t Imm) {
+ if (is64Bit) {
+ if (isInt<8>(Imm))
+ return X86::ADD64ri8;
+ return X86::ADD64ri32;
+ } else {
+ if (isInt<8>(Imm))
+ return X86::ADD32ri8;
+ return X86::ADD32ri;
+ }
+}
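
The only question these two helpers answer is whether the immediate fits in a
signed byte, which selects the shorter ri8 encoding. A self-contained sketch of
that check (the helper name below is ours, mirroring llvm::isInt<8>):

  #include <cstdint>
  #include <cstdio>
  #include <initializer_list>

  static bool fitsInSignedByte(int64_t Imm) { return Imm >= -128 && Imm <= 127; }

  int main() {
    for (int64_t Imm : {8, 127, 128, 4096})
      std::printf("%lld -> %s\n", (long long)Imm,
                  fitsInSignedByte(Imm) ? "SUB32ri8" : "SUB32ri");
    return 0;
  }
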
+
+void X86RegisterInfo::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
+ if (!hasReservedCallFrame(MF)) {
+    // If the stack pointer can be changed after the prologue, turn the
+    // adjcallstackdown instruction into a 'sub ESP, <amt>' and the
+    // adjcallstackup instruction into an 'add ESP, <amt>'.
+ // TODO: consider using push / pop instead of sub + store / add
+ MachineInstr *Old = I;
+ uint64_t Amount = Old->getOperand(0).getImm();
+ if (Amount != 0) {
+ // We need to keep the stack aligned properly. To do this, we round the
+ // amount of space needed for the outgoing arguments up to the next
+ // alignment boundary.
+ Amount = (Amount + StackAlign - 1) / StackAlign * StackAlign;
+
+ MachineInstr *New = 0;
+ if (Old->getOpcode() == getCallFrameSetupOpcode()) {
+ New = BuildMI(MF, Old->getDebugLoc(),
+ TII.get(getSUBriOpcode(Is64Bit, Amount)),
+ StackPtr)
+ .addReg(StackPtr)
+ .addImm(Amount);
+ } else {
+ assert(Old->getOpcode() == getCallFrameDestroyOpcode());
+
+ // Factor out the amount the callee already popped.
+ uint64_t CalleeAmt = Old->getOperand(1).getImm();
+ Amount -= CalleeAmt;
+
+ if (Amount) {
+ unsigned Opc = getADDriOpcode(Is64Bit, Amount);
+ New = BuildMI(MF, Old->getDebugLoc(), TII.get(Opc), StackPtr)
+ .addReg(StackPtr)
+ .addImm(Amount);
+ }
+ }
+
+ if (New) {
+ // The EFLAGS implicit def is dead.
+ New->getOperand(3).setIsDead();
+
+ // Replace the pseudo instruction with a new instruction.
+ MBB.insert(I, New);
+ }
+ }
+ } else if (I->getOpcode() == getCallFrameDestroyOpcode()) {
+ // If we are performing frame pointer elimination and if the callee pops
+ // something off the stack pointer, add it back. We do this until we have
+ // more advanced stack pointer tracking ability.
+ if (uint64_t CalleeAmt = I->getOperand(1).getImm()) {
+ unsigned Opc = getSUBriOpcode(Is64Bit, CalleeAmt);
+ MachineInstr *Old = I;
+ MachineInstr *New =
+ BuildMI(MF, Old->getDebugLoc(), TII.get(Opc),
+ StackPtr)
+ .addReg(StackPtr)
+ .addImm(CalleeAmt);
+
+ // The EFLAGS implicit def is dead.
+ New->getOperand(3).setIsDead();
+ MBB.insert(I, New);
+ }
+ }
+
+ MBB.erase(I);
+}
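
The 'Amount' rounding in eliminateCallFramePseudoInstr above is plain
round-up-to-multiple arithmetic; a small standalone sketch with illustrative
values:

  #include <cstdint>
  #include <cstdio>

  static uint64_t roundUpTo(uint64_t Amount, uint64_t StackAlign) {
    return (Amount + StackAlign - 1) / StackAlign * StackAlign;
  }

  int main() {
    std::printf("%llu\n", (unsigned long long)roundUpTo(20, 16)); // 32
    std::printf("%llu\n", (unsigned long long)roundUpTo(32, 16)); // 32
    return 0;
  }
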
+
+unsigned
+X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, FrameIndexValue *Value,
+ RegScavenger *RS) const{
+ assert(SPAdj == 0 && "Unexpected");
+
+ unsigned i = 0;
+ MachineInstr &MI = *II;
+ MachineFunction &MF = *MI.getParent()->getParent();
+
+ while (!MI.getOperand(i).isFI()) {
+ ++i;
+ assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
+ }
+
+ int FrameIndex = MI.getOperand(i).getIndex();
+ unsigned BasePtr;
+
+ unsigned Opc = MI.getOpcode();
+ bool AfterFPPop = Opc == X86::TAILJMPm64 || Opc == X86::TAILJMPm;
+ if (needsStackRealignment(MF))
+ BasePtr = (FrameIndex < 0 ? FramePtr : StackPtr);
+ else if (AfterFPPop)
+ BasePtr = StackPtr;
+ else
+ BasePtr = (hasFP(MF) ? FramePtr : StackPtr);
+
+  // This must be part of a four operand memory reference.  Replace the
+  // FrameIndex with the selected base register.
+ MI.getOperand(i).ChangeToRegister(BasePtr, false);
+
+ // Now add the frame object offset to the offset from EBP.
+ int FIOffset;
+ if (AfterFPPop) {
+ // Tail call jmp happens after FP is popped.
+ const TargetFrameInfo &TFI = *MF.getTarget().getFrameInfo();
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ FIOffset = MFI->getObjectOffset(FrameIndex) - TFI.getOffsetOfLocalArea();
+ } else
+ FIOffset = getFrameIndexOffset(MF, FrameIndex);
+
+ if (MI.getOperand(i+3).isImm()) {
+ // Offset is a 32-bit integer.
+ int Offset = FIOffset + (int)(MI.getOperand(i + 3).getImm());
+ MI.getOperand(i + 3).ChangeToImmediate(Offset);
+ } else {
+ // Offset is symbolic. This is extremely rare.
+ uint64_t Offset = FIOffset + (uint64_t)MI.getOperand(i+3).getOffset();
+ MI.getOperand(i+3).setOffset(Offset);
+ }
+ return 0;
+}
+
+void
+X86RegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+ RegScavenger *RS) const {
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ int32_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
+
+ if (TailCallReturnAddrDelta < 0) {
+ // create RETURNADDR area
+ // arg
+ // arg
+ // RETADDR
+ // { ...
+ // RETADDR area
+ // ...
+ // }
+ // [EBP]
+ MFI->CreateFixedObject(-TailCallReturnAddrDelta,
+ (-1U*SlotSize)+TailCallReturnAddrDelta,
+ true, false);
+ }
+
+ if (hasFP(MF)) {
+ assert((TailCallReturnAddrDelta <= 0) &&
+ "The Delta should always be zero or negative");
+ const TargetFrameInfo &TFI = *MF.getTarget().getFrameInfo();
+
+ // Create a frame entry for the EBP register that must be saved.
+ int FrameIdx = MFI->CreateFixedObject(SlotSize,
+ -(int)SlotSize +
+ TFI.getOffsetOfLocalArea() +
+ TailCallReturnAddrDelta,
+ true, false);
+ assert(FrameIdx == MFI->getObjectIndexBegin() &&
+ "Slot for EBP register must be last in order to be found!");
+ FrameIdx = 0;
+ }
+}
+
+/// emitSPUpdate - Emit a series of instructions to increment / decrement the
+/// stack pointer by a constant value.
+static
+void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+ unsigned StackPtr, int64_t NumBytes, bool Is64Bit,
+ const TargetInstrInfo &TII) {
+ bool isSub = NumBytes < 0;
+ uint64_t Offset = isSub ? -NumBytes : NumBytes;
+ unsigned Opc = isSub ?
+ getSUBriOpcode(Is64Bit, Offset) :
+ getADDriOpcode(Is64Bit, Offset);
+ uint64_t Chunk = (1LL << 31) - 1;
+ DebugLoc DL = MBB.findDebugLoc(MBBI);
+
+ while (Offset) {
+ uint64_t ThisVal = (Offset > Chunk) ? Chunk : Offset;
+ MachineInstr *MI =
+ BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
+ .addReg(StackPtr)
+ .addImm(ThisVal);
+ MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
+ Offset -= ThisVal;
+ }
+}
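
Because the ADD/SUB immediates are at most 32 bits, emitSPUpdate splits very large
adjustments into chunks of at most 2^31 - 1 bytes. A standalone sketch of that
loop (the printed mnemonic is only illustrative):

  #include <cstdint>
  #include <cstdio>

  int main() {
    uint64_t Offset = (3ULL << 31) + 100;   // deliberately larger than one chunk
    const uint64_t Chunk = (1ULL << 31) - 1;
    while (Offset) {
      uint64_t ThisVal = Offset > Chunk ? Chunk : Offset;
      std::printf("sub $%llu, %%rsp\n", (unsigned long long)ThisVal);
      Offset -= ThisVal;
    }
    return 0;
  }
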
+
+/// mergeSPUpdatesUp - If the instruction immediately before the iterator
+/// adjusts the stack pointer, fold its amount into *NumBytes and erase it.
+static
+void mergeSPUpdatesUp(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+ unsigned StackPtr, uint64_t *NumBytes = NULL) {
+ if (MBBI == MBB.begin()) return;
+
+ MachineBasicBlock::iterator PI = prior(MBBI);
+ unsigned Opc = PI->getOpcode();
+ if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
+ Opc == X86::ADD32ri || Opc == X86::ADD32ri8) &&
+ PI->getOperand(0).getReg() == StackPtr) {
+ if (NumBytes)
+ *NumBytes += PI->getOperand(2).getImm();
+ MBB.erase(PI);
+ } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
+ Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
+ PI->getOperand(0).getReg() == StackPtr) {
+ if (NumBytes)
+ *NumBytes -= PI->getOperand(2).getImm();
+ MBB.erase(PI);
+ }
+}
+
+/// mergeSPUpdatesDown - If the instruction immediately after the iterator
+/// adjusts the stack pointer, fold its amount into *NumBytes and erase it.
+static
+void mergeSPUpdatesDown(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ unsigned StackPtr, uint64_t *NumBytes = NULL) {
+ // FIXME: THIS ISN'T RUN!!!
+ return;
+
+ if (MBBI == MBB.end()) return;
+
+ MachineBasicBlock::iterator NI = llvm::next(MBBI);
+ if (NI == MBB.end()) return;
+
+ unsigned Opc = NI->getOpcode();
+ if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
+ Opc == X86::ADD32ri || Opc == X86::ADD32ri8) &&
+ NI->getOperand(0).getReg() == StackPtr) {
+ if (NumBytes)
+ *NumBytes -= NI->getOperand(2).getImm();
+ MBB.erase(NI);
+ MBBI = NI;
+ } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
+ Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
+ NI->getOperand(0).getReg() == StackPtr) {
+ if (NumBytes)
+ *NumBytes += NI->getOperand(2).getImm();
+ MBB.erase(NI);
+ MBBI = NI;
+ }
+}
+
+/// mergeSPUpdates - Check the instruction before/after the passed
+/// instruction. If it is an ADD/SUB of the stack pointer, it is deleted and
+/// the stack adjustment is returned as a positive value for ADD and a
+/// negative one for SUB.
+static int mergeSPUpdates(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI,
+ unsigned StackPtr,
+ bool doMergeWithPrevious) {
+ if ((doMergeWithPrevious && MBBI == MBB.begin()) ||
+ (!doMergeWithPrevious && MBBI == MBB.end()))
+ return 0;
+
+ MachineBasicBlock::iterator PI = doMergeWithPrevious ? prior(MBBI) : MBBI;
+ MachineBasicBlock::iterator NI = doMergeWithPrevious ? 0 : llvm::next(MBBI);
+ unsigned Opc = PI->getOpcode();
+ int Offset = 0;
+
+ if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
+ Opc == X86::ADD32ri || Opc == X86::ADD32ri8) &&
+ PI->getOperand(0).getReg() == StackPtr){
+ Offset += PI->getOperand(2).getImm();
+ MBB.erase(PI);
+ if (!doMergeWithPrevious) MBBI = NI;
+ } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
+ Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
+ PI->getOperand(0).getReg() == StackPtr) {
+ Offset -= PI->getOperand(2).getImm();
+ MBB.erase(PI);
+ if (!doMergeWithPrevious) MBBI = NI;
+ }
+
+ return Offset;
+}
+
+void X86RegisterInfo::emitCalleeSavedFrameMoves(MachineFunction &MF,
+ MCSymbol *Label,
+ unsigned FramePtr) const {
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ MachineModuleInfo &MMI = MF.getMMI();
+
+ // Add callee saved registers to move list.
+ const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+ if (CSI.empty()) return;
+
+ std::vector<MachineMove> &Moves = MMI.getFrameMoves();
+ const TargetData *TD = MF.getTarget().getTargetData();
+ bool HasFP = hasFP(MF);
+
+ // Calculate amount of bytes used for return address storing.
+ int stackGrowth =
+ (MF.getTarget().getFrameInfo()->getStackGrowthDirection() ==
+ TargetFrameInfo::StackGrowsUp ?
+ TD->getPointerSize() : -TD->getPointerSize());
+
+  // FIXME: This is a dirty hack. The code itself is a mess right now.
+  // It should be rewritten from scratch and generalized at some point.
+
+  // Determine maximum offset (minimum due to stack growth).
+ int64_t MaxOffset = 0;
+ for (std::vector<CalleeSavedInfo>::const_iterator
+ I = CSI.begin(), E = CSI.end(); I != E; ++I)
+ MaxOffset = std::min(MaxOffset,
+ MFI->getObjectOffset(I->getFrameIdx()));
+
+ // Calculate offsets.
+ int64_t saveAreaOffset = (HasFP ? 3 : 2) * stackGrowth;
+ for (std::vector<CalleeSavedInfo>::const_iterator
+ I = CSI.begin(), E = CSI.end(); I != E; ++I) {
+ int64_t Offset = MFI->getObjectOffset(I->getFrameIdx());
+ unsigned Reg = I->getReg();
+ Offset = MaxOffset - Offset + saveAreaOffset;
+
+ // Don't output a new machine move if we're re-saving the frame
+ // pointer. This happens when the PrologEpilogInserter has inserted an extra
+ // "PUSH" of the frame pointer -- the "emitPrologue" method automatically
+ // generates one when frame pointers are used. If we generate a "machine
+ // move" for this extra "PUSH", the linker will lose track of the fact that
+ // the frame pointer should have the value of the first "PUSH" when it's
+ // trying to unwind.
+ //
+ // FIXME: This looks inelegant. It's possibly correct, but it's covering up
+ // another bug. I.e., one where we generate a prolog like this:
+ //
+ // pushl %ebp
+ // movl %esp, %ebp
+ // pushl %ebp
+ // pushl %esi
+ // ...
+ //
+ // The immediate re-push of EBP is unnecessary. At the least, it's an
+ // optimization bug. EBP can be used as a scratch register in certain
+ // cases, but probably not when we have a frame pointer.
+ if (HasFP && FramePtr == Reg)
+ continue;
+
+ MachineLocation CSDst(MachineLocation::VirtualFP, Offset);
+ MachineLocation CSSrc(Reg);
+ Moves.push_back(MachineMove(Label, CSDst, CSSrc));
+ }
+}
+
+/// emitPrologue - Push callee-saved registers onto the stack, which
+/// automatically adjusts the stack pointer. Adjust the stack pointer further
+/// to allocate space for local variables. Also emit the labels used by the
+/// exception handler to generate the exception-handling frames.
+void X86RegisterInfo::emitPrologue(MachineFunction &MF) const {
+ MachineBasicBlock &MBB = MF.front(); // Prologue goes in entry BB.
+ MachineBasicBlock::iterator MBBI = MBB.begin();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ const Function *Fn = MF.getFunction();
+ const X86Subtarget *Subtarget = &MF.getTarget().getSubtarget<X86Subtarget>();
+ MachineModuleInfo &MMI = MF.getMMI();
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ bool needsFrameMoves = MMI.hasDebugInfo() ||
+ !Fn->doesNotThrow() || UnwindTablesMandatory;
+ uint64_t MaxAlign = MFI->getMaxAlignment(); // Desired stack alignment.
+ uint64_t StackSize = MFI->getStackSize(); // Number of bytes to allocate.
+ bool HasFP = hasFP(MF);
+ DebugLoc DL;
+
+ // Add RETADDR move area to callee saved frame size.
+ int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
+ if (TailCallReturnAddrDelta < 0)
+ X86FI->setCalleeSavedFrameSize(
+ X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta);
+
+  // If this is x86-64, the Red Zone is not disabled, we are a leaf function,
+  // we use no more than 128 bytes of stack space, and we have no frame
+  // pointer, calls, or dynamic allocas, then we do not need to adjust the
+  // stack pointer (we fit in the Red Zone).
+ if (Is64Bit && !Fn->hasFnAttr(Attribute::NoRedZone) &&
+ !needsStackRealignment(MF) &&
+ !MFI->hasVarSizedObjects() && // No dynamic alloca.
+ !MFI->adjustsStack() && // No calls.
+ !Subtarget->isTargetWin64()) { // Win64 has no Red Zone
+ uint64_t MinSize = X86FI->getCalleeSavedFrameSize();
+ if (HasFP) MinSize += SlotSize;
+ StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0);
+ MFI->setStackSize(StackSize);
+ } else if (Subtarget->isTargetWin64()) {
+ // We need to always allocate 32 bytes as register spill area.
+ // FIXME: We might reuse these 32 bytes for leaf functions.
+ StackSize += 32;
+ MFI->setStackSize(StackSize);
+ }
+
+ // Insert stack pointer adjustment for later moving of return addr. Only
+ // applies to tail call optimized functions where the callee argument stack
+ // size is bigger than the callers.
+ if (TailCallReturnAddrDelta < 0) {
+ MachineInstr *MI =
+ BuildMI(MBB, MBBI, DL,
+ TII.get(getSUBriOpcode(Is64Bit, -TailCallReturnAddrDelta)),
+ StackPtr)
+ .addReg(StackPtr)
+ .addImm(-TailCallReturnAddrDelta);
+ MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
+ }
+
+ // Mapping for machine moves:
+ //
+ // DST: VirtualFP AND
+ // SRC: VirtualFP => DW_CFA_def_cfa_offset
+ // ELSE => DW_CFA_def_cfa
+ //
+ // SRC: VirtualFP AND
+ // DST: Register => DW_CFA_def_cfa_register
+ //
+ // ELSE
+ // OFFSET < 0 => DW_CFA_offset_extended_sf
+ // REG < 64 => DW_CFA_offset + Reg
+ // ELSE => DW_CFA_offset_extended
+
+ std::vector<MachineMove> &Moves = MMI.getFrameMoves();
+ const TargetData *TD = MF.getTarget().getTargetData();
+ uint64_t NumBytes = 0;
+ int stackGrowth = -TD->getPointerSize();
+
+ if (HasFP) {
+ // Calculate required stack adjustment.
+ uint64_t FrameSize = StackSize - SlotSize;
+ if (needsStackRealignment(MF))
+ FrameSize = (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign;
+
+ NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize();
+
+ // Get the offset of the stack slot for the EBP register, which is
+ // guaranteed to be the last slot by processFunctionBeforeFrameFinalized.
+ // Update the frame offset adjustment.
+ MFI->setOffsetAdjustment(-NumBytes);
+
+ // Save EBP/RBP into the appropriate stack slot.
+ BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
+ .addReg(FramePtr, RegState::Kill);
+
+ if (needsFrameMoves) {
+ // Mark the place where EBP/RBP was saved.
+ MCSymbol *FrameLabel = MMI.getContext().CreateTempSymbol();
+ BuildMI(MBB, MBBI, DL, TII.get(X86::DBG_LABEL)).addSym(FrameLabel);
+
+ // Define the current CFA rule to use the provided offset.
+ if (StackSize) {
+ MachineLocation SPDst(MachineLocation::VirtualFP);
+ MachineLocation SPSrc(MachineLocation::VirtualFP, 2 * stackGrowth);
+ Moves.push_back(MachineMove(FrameLabel, SPDst, SPSrc));
+ } else {
+ // FIXME: Verify & implement for FP
+ MachineLocation SPDst(StackPtr);
+ MachineLocation SPSrc(StackPtr, stackGrowth);
+ Moves.push_back(MachineMove(FrameLabel, SPDst, SPSrc));
+ }
+
+ // Change the rule for the FramePtr to be an "offset" rule.
+ MachineLocation FPDst(MachineLocation::VirtualFP, 2 * stackGrowth);
+ MachineLocation FPSrc(FramePtr);
+ Moves.push_back(MachineMove(FrameLabel, FPDst, FPSrc));
+ }
+
+ // Update EBP with the new base value...
+ BuildMI(MBB, MBBI, DL,
+ TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), FramePtr)
+ .addReg(StackPtr);
+
+ if (needsFrameMoves) {
+ // Mark effective beginning of when frame pointer becomes valid.
+ MCSymbol *FrameLabel = MMI.getContext().CreateTempSymbol();
+ BuildMI(MBB, MBBI, DL, TII.get(X86::DBG_LABEL)).addSym(FrameLabel);
+
+ // Define the current CFA to use the EBP/RBP register.
+ MachineLocation FPDst(FramePtr);
+ MachineLocation FPSrc(MachineLocation::VirtualFP);
+ Moves.push_back(MachineMove(FrameLabel, FPDst, FPSrc));
+ }
+
+ // Mark the FramePtr as live-in in every block except the entry.
+ for (MachineFunction::iterator I = llvm::next(MF.begin()), E = MF.end();
+ I != E; ++I)
+ I->addLiveIn(FramePtr);
+
+ // Realign stack
+ if (needsStackRealignment(MF)) {
+ MachineInstr *MI =
+ BuildMI(MBB, MBBI, DL,
+ TII.get(Is64Bit ? X86::AND64ri32 : X86::AND32ri),
+ StackPtr).addReg(StackPtr).addImm(-MaxAlign);
+
+ // The EFLAGS implicit def is dead.
+ MI->getOperand(3).setIsDead();
+ }
+ } else {
+ NumBytes = StackSize - X86FI->getCalleeSavedFrameSize();
+ }
+
+ // Skip the callee-saved push instructions.
+ bool PushedRegs = false;
+ int StackOffset = 2 * stackGrowth;
+
+ while (MBBI != MBB.end() &&
+ (MBBI->getOpcode() == X86::PUSH32r ||
+ MBBI->getOpcode() == X86::PUSH64r)) {
+ PushedRegs = true;
+ ++MBBI;
+
+ if (!HasFP && needsFrameMoves) {
+ // Mark callee-saved push instruction.
+ MCSymbol *Label = MMI.getContext().CreateTempSymbol();
+ BuildMI(MBB, MBBI, DL, TII.get(X86::DBG_LABEL)).addSym(Label);
+
+ // Define the current CFA rule to use the provided offset.
+ unsigned Ptr = StackSize ?
+ MachineLocation::VirtualFP : StackPtr;
+ MachineLocation SPDst(Ptr);
+ MachineLocation SPSrc(Ptr, StackOffset);
+ Moves.push_back(MachineMove(Label, SPDst, SPSrc));
+ StackOffset += stackGrowth;
+ }
+ }
+
+ DL = MBB.findDebugLoc(MBBI);
+
+ // Adjust stack pointer: ESP -= numbytes.
+ if (NumBytes >= 4096 && Subtarget->isTargetCygMing()) {
+    // Check whether EAX is live-in for this function.
+ bool isEAXAlive = false;
+ for (MachineRegisterInfo::livein_iterator
+ II = MF.getRegInfo().livein_begin(),
+ EE = MF.getRegInfo().livein_end(); (II != EE) && !isEAXAlive; ++II) {
+ unsigned Reg = II->first;
+ isEAXAlive = (Reg == X86::EAX || Reg == X86::AX ||
+ Reg == X86::AH || Reg == X86::AL);
+ }
+
+ // Function prologue calls _alloca to probe the stack when allocating more
+ // than 4k bytes in one go. Touching the stack at 4K increments is necessary
+ // to ensure that the guard pages used by the OS virtual memory manager are
+    // allocated in the correct sequence.
+ if (!isEAXAlive) {
+ BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
+ .addImm(NumBytes);
+ BuildMI(MBB, MBBI, DL, TII.get(X86::CALLpcrel32))
+ .addExternalSymbol("_alloca")
+ .addReg(StackPtr, RegState::Define | RegState::Implicit);
+ } else {
+ // Save EAX
+ BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r))
+ .addReg(X86::EAX, RegState::Kill);
+
+ // Allocate NumBytes-4 bytes on stack. We'll also use 4 already
+ // allocated bytes for EAX.
+ BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
+ .addImm(NumBytes - 4);
+ BuildMI(MBB, MBBI, DL, TII.get(X86::CALLpcrel32))
+ .addExternalSymbol("_alloca")
+ .addReg(StackPtr, RegState::Define | RegState::Implicit);
+
+ // Restore EAX
+ MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm),
+ X86::EAX),
+ StackPtr, false, NumBytes - 4);
+ MBB.insert(MBBI, MI);
+ }
+ } else if (NumBytes) {
+    // If there is a SUB32ri of ESP immediately before this instruction, merge
+    // the two. This can be the case when tail call elimination is enabled and
+    // the callee has more arguments than the caller.
+ NumBytes -= mergeSPUpdates(MBB, MBBI, StackPtr, true);
+
+ // If there is an ADD32ri or SUB32ri of ESP immediately after this
+ // instruction, merge the two instructions.
+ mergeSPUpdatesDown(MBB, MBBI, StackPtr, &NumBytes);
+
+ if (NumBytes)
+ emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, TII);
+ }
+
+ if ((NumBytes || PushedRegs) && needsFrameMoves) {
+ // Mark end of stack pointer adjustment.
+ MCSymbol *Label = MMI.getContext().CreateTempSymbol();
+ BuildMI(MBB, MBBI, DL, TII.get(X86::DBG_LABEL)).addSym(Label);
+
+ if (!HasFP && NumBytes) {
+ // Define the current CFA rule to use the provided offset.
+ if (StackSize) {
+ MachineLocation SPDst(MachineLocation::VirtualFP);
+ MachineLocation SPSrc(MachineLocation::VirtualFP,
+ -StackSize + stackGrowth);
+ Moves.push_back(MachineMove(Label, SPDst, SPSrc));
+ } else {
+ // FIXME: Verify & implement for FP
+ MachineLocation SPDst(StackPtr);
+ MachineLocation SPSrc(StackPtr, stackGrowth);
+ Moves.push_back(MachineMove(Label, SPDst, SPSrc));
+ }
+ }
+
+ // Emit DWARF info specifying the offsets of the callee-saved registers.
+ if (PushedRegs)
+ emitCalleeSavedFrameMoves(MF, Label, HasFP ? FramePtr : StackPtr);
+ }
+}
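
To make the red-zone special case near the top of emitPrologue concrete, a
standalone sketch with made-up numbers: a 96-byte leaf frame on x86-64 fits
entirely inside the 128-byte red zone, so no explicit stack-pointer adjustment is
needed.

  #include <algorithm>
  #include <cstdint>
  #include <cstdio>

  int main() {
    uint64_t StackSize = 96;  // bytes of locals in a leaf function
    uint64_t MinSize   = 0;   // callee-saved frame size (no FP slot)
    StackSize = std::max(MinSize,
                         StackSize > 128 ? StackSize - 128 : uint64_t(0));
    std::printf("bytes to subtract from RSP: %llu\n",
                (unsigned long long)StackSize); // prints 0
    return 0;
  }
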
+
+void X86RegisterInfo::emitEpilogue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ MachineBasicBlock::iterator MBBI = prior(MBB.end());
+ unsigned RetOpcode = MBBI->getOpcode();
+ DebugLoc DL = MBBI->getDebugLoc();
+
+ switch (RetOpcode) {
+ default:
+ llvm_unreachable("Can only insert epilog into returning blocks");
+ case X86::RET:
+ case X86::RETI:
+ case X86::TCRETURNdi:
+ case X86::TCRETURNri:
+ case X86::TCRETURNmi:
+ case X86::TCRETURNdi64:
+ case X86::TCRETURNri64:
+ case X86::TCRETURNmi64:
+ case X86::EH_RETURN:
+ case X86::EH_RETURN64:
+ break; // These are ok
+ }
+
+ // Get the number of bytes to allocate from the FrameInfo.
+ uint64_t StackSize = MFI->getStackSize();
+ uint64_t MaxAlign = MFI->getMaxAlignment();
+ unsigned CSSize = X86FI->getCalleeSavedFrameSize();
+ uint64_t NumBytes = 0;
+
+ if (hasFP(MF)) {
+ // Calculate required stack adjustment.
+ uint64_t FrameSize = StackSize - SlotSize;
+ if (needsStackRealignment(MF))
+ FrameSize = (FrameSize + MaxAlign - 1)/MaxAlign*MaxAlign;
+
+ NumBytes = FrameSize - CSSize;
+
+ // Pop EBP.
+ BuildMI(MBB, MBBI, DL,
+ TII.get(Is64Bit ? X86::POP64r : X86::POP32r), FramePtr);
+ } else {
+ NumBytes = StackSize - CSSize;
+ }
+
+ // Skip the callee-saved pop instructions.
+ MachineBasicBlock::iterator LastCSPop = MBBI;
+ while (MBBI != MBB.begin()) {
+ MachineBasicBlock::iterator PI = prior(MBBI);
+ unsigned Opc = PI->getOpcode();
+
+ if (Opc != X86::POP32r && Opc != X86::POP64r &&
+ !PI->getDesc().isTerminator())
+ break;
+
+ --MBBI;
+ }
+
+ DL = MBBI->getDebugLoc();
+
+ // If there is an ADD32ri or SUB32ri of ESP immediately before this
+ // instruction, merge the two instructions.
+ if (NumBytes || MFI->hasVarSizedObjects())
+ mergeSPUpdatesUp(MBB, MBBI, StackPtr, &NumBytes);
+
+  // If dynamic allocas are used, reset ESP to point to the last callee-saved
+  // slot before popping the callee-saved registers off. The same applies when
+  // the stack was realigned.
+ if (needsStackRealignment(MF)) {
+    // We cannot use LEA here because the stack pointer was realigned. We need
+    // to deallocate the local frame explicitly.
+ if (CSSize) {
+ emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, TII);
+ MBBI = prior(LastCSPop);
+ }
+
+ BuildMI(MBB, MBBI, DL,
+ TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr),
+ StackPtr).addReg(FramePtr);
+ } else if (MFI->hasVarSizedObjects()) {
+ if (CSSize) {
+ unsigned Opc = Is64Bit ? X86::LEA64r : X86::LEA32r;
+ MachineInstr *MI =
+ addLeaRegOffset(BuildMI(MF, DL, TII.get(Opc), StackPtr),
+ FramePtr, false, -CSSize);
+ MBB.insert(MBBI, MI);
+ } else {
+ BuildMI(MBB, MBBI, DL,
+ TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), StackPtr)
+ .addReg(FramePtr);
+ }
+ } else if (NumBytes) {
+ // Adjust stack pointer back: ESP += numbytes.
+ emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, TII);
+ }
+
+  // We're returning from the function via eh_return.
+ if (RetOpcode == X86::EH_RETURN || RetOpcode == X86::EH_RETURN64) {
+ MBBI = prior(MBB.end());
+ MachineOperand &DestAddr = MBBI->getOperand(0);
+ assert(DestAddr.isReg() && "Offset should be in register!");
+ BuildMI(MBB, MBBI, DL,
+ TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr),
+ StackPtr).addReg(DestAddr.getReg());
+ } else if (RetOpcode == X86::TCRETURNri || RetOpcode == X86::TCRETURNdi ||
+ RetOpcode == X86::TCRETURNmi ||
+ RetOpcode == X86::TCRETURNri64 || RetOpcode == X86::TCRETURNdi64 ||
+ RetOpcode == X86::TCRETURNmi64) {
+ bool isMem = RetOpcode == X86::TCRETURNmi || RetOpcode == X86::TCRETURNmi64;
+ // Tail call return: adjust the stack pointer and jump to callee.
+ MBBI = prior(MBB.end());
+ MachineOperand &JumpTarget = MBBI->getOperand(0);
+ MachineOperand &StackAdjust = MBBI->getOperand(isMem ? 5 : 1);
+ assert(StackAdjust.isImm() && "Expecting immediate value.");
+
+ // Adjust stack pointer.
+ int StackAdj = StackAdjust.getImm();
+ int MaxTCDelta = X86FI->getTCReturnAddrDelta();
+ int Offset = 0;
+ assert(MaxTCDelta <= 0 && "MaxTCDelta should never be positive");
+
+  // Incorporate the retaddr area.
+ Offset = StackAdj-MaxTCDelta;
+ assert(Offset >= 0 && "Offset should never be negative");
+
+ if (Offset) {
+    // Check for possible merge with preceding ADD instruction.
+ Offset += mergeSPUpdates(MBB, MBBI, StackPtr, true);
+ emitSPUpdate(MBB, MBBI, StackPtr, Offset, Is64Bit, TII);
+ }
+
+ // Jump to label or value in register.
+ if (RetOpcode == X86::TCRETURNdi || RetOpcode == X86::TCRETURNdi64) {
+ BuildMI(MBB, MBBI, DL, TII.get((RetOpcode == X86::TCRETURNdi)
+ ? X86::TAILJMPd : X86::TAILJMPd64)).
+ addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(),
+ JumpTarget.getTargetFlags());
+ } else if (RetOpcode == X86::TCRETURNmi || RetOpcode == X86::TCRETURNmi64) {
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, DL, TII.get((RetOpcode == X86::TCRETURNmi)
+ ? X86::TAILJMPm : X86::TAILJMPm64));
+ for (unsigned i = 0; i != 5; ++i)
+ MIB.addOperand(MBBI->getOperand(i));
+ } else if (RetOpcode == X86::TCRETURNri64) {
+ BuildMI(MBB, MBBI, DL, TII.get(X86::TAILJMPr64), JumpTarget.getReg());
+ } else {
+ BuildMI(MBB, MBBI, DL, TII.get(X86::TAILJMPr), JumpTarget.getReg());
+ }
+
+ MachineInstr *NewMI = prior(MBBI);
+ for (unsigned i = 2, e = MBBI->getNumOperands(); i != e; ++i)
+ NewMI->addOperand(MBBI->getOperand(i));
+
+ // Delete the pseudo instruction TCRETURN.
+ MBB.erase(MBBI);
+ } else if ((RetOpcode == X86::RET || RetOpcode == X86::RETI) &&
+ (X86FI->getTCReturnAddrDelta() < 0)) {
+ // Add the return addr area delta back since we are not tail calling.
+ int delta = -1*X86FI->getTCReturnAddrDelta();
+ MBBI = prior(MBB.end());
+
+    // Check for possible merge with preceding ADD instruction.
+ delta += mergeSPUpdates(MBB, MBBI, StackPtr, true);
+ emitSPUpdate(MBB, MBBI, StackPtr, delta, Is64Bit, TII);
+ }
+}
+
+unsigned X86RegisterInfo::getRARegister() const {
+ return Is64Bit ? X86::RIP // Should have dwarf #16.
+ : X86::EIP; // Should have dwarf #8.
+}
+
+unsigned X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+ return hasFP(MF) ? FramePtr : StackPtr;
+}
+
+void
+X86RegisterInfo::getInitialFrameState(std::vector<MachineMove> &Moves) const {
+ // Calculate amount of bytes used for return address storing
+ int stackGrowth = (Is64Bit ? -8 : -4);
+
+ // Initial state of the frame pointer is esp+stackGrowth.
+ MachineLocation Dst(MachineLocation::VirtualFP);
+ MachineLocation Src(StackPtr, stackGrowth);
+ Moves.push_back(MachineMove(0, Dst, Src));
+
+ // Add return address to move list
+ MachineLocation CSDst(StackPtr, stackGrowth);
+ MachineLocation CSSrc(getRARegister());
+ Moves.push_back(MachineMove(0, CSDst, CSSrc));
+}
+
+unsigned X86RegisterInfo::getEHExceptionRegister() const {
+ llvm_unreachable("What is the exception register");
+ return 0;
+}
+
+unsigned X86RegisterInfo::getEHHandlerRegister() const {
+ llvm_unreachable("What is the exception handler register");
+ return 0;
+}
+
+namespace llvm {
+unsigned getX86SubSuperRegister(unsigned Reg, EVT VT, bool High) {
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: return Reg;
+ case MVT::i8:
+ if (High) {
+ switch (Reg) {
+ default: return 0;
+ case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+ return X86::AH;
+ case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+ return X86::DH;
+ case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+ return X86::CH;
+ case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+ return X86::BH;
+ }
+ } else {
+ switch (Reg) {
+ default: return 0;
+ case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+ return X86::AL;
+ case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+ return X86::DL;
+ case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+ return X86::CL;
+ case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+ return X86::BL;
+ case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+ return X86::SIL;
+ case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+ return X86::DIL;
+ case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+ return X86::BPL;
+ case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+ return X86::SPL;
+ case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+ return X86::R8B;
+ case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+ return X86::R9B;
+ case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+ return X86::R10B;
+ case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+ return X86::R11B;
+ case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+ return X86::R12B;
+ case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+ return X86::R13B;
+ case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+ return X86::R14B;
+ case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+ return X86::R15B;
+ }
+ }
+ case MVT::i16:
+ switch (Reg) {
+ default: return Reg;
+ case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+ return X86::AX;
+ case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+ return X86::DX;
+ case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+ return X86::CX;
+ case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+ return X86::BX;
+ case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+ return X86::SI;
+ case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+ return X86::DI;
+ case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+ return X86::BP;
+ case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+ return X86::SP;
+ case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+ return X86::R8W;
+ case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+ return X86::R9W;
+ case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+ return X86::R10W;
+ case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+ return X86::R11W;
+ case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+ return X86::R12W;
+ case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+ return X86::R13W;
+ case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+ return X86::R14W;
+ case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+ return X86::R15W;
+ }
+ case MVT::i32:
+ switch (Reg) {
+ default: return Reg;
+ case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+ return X86::EAX;
+ case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+ return X86::EDX;
+ case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+ return X86::ECX;
+ case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+ return X86::EBX;
+ case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+ return X86::ESI;
+ case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+ return X86::EDI;
+ case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+ return X86::EBP;
+ case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+ return X86::ESP;
+ case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+ return X86::R8D;
+ case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+ return X86::R9D;
+ case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+ return X86::R10D;
+ case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+ return X86::R11D;
+ case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+ return X86::R12D;
+ case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+ return X86::R13D;
+ case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+ return X86::R14D;
+ case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+ return X86::R15D;
+ }
+ case MVT::i64:
+ switch (Reg) {
+ default: return Reg;
+ case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+ return X86::RAX;
+ case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+ return X86::RDX;
+ case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+ return X86::RCX;
+ case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+ return X86::RBX;
+ case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+ return X86::RSI;
+ case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+ return X86::RDI;
+ case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+ return X86::RBP;
+ case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+ return X86::RSP;
+ case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+ return X86::R8;
+ case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+ return X86::R9;
+ case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+ return X86::R10;
+ case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+ return X86::R11;
+ case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+ return X86::R12;
+ case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+ return X86::R13;
+ case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+ return X86::R14;
+ case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+ return X86::R15;
+ }
+ }
+
+ return Reg;
+}
+}
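A minimal usage sketch of the helper defined above (illustrative only, not part of the patch; it assumes a caller inside the llvm namespace, and the expected values simply restate the switch cases):

  unsigned R16 = getX86SubSuperRegister(X86::EAX, MVT::i16);       // X86::AX
  unsigned R64 = getX86SubSuperRegister(X86::CL,  MVT::i64);       // X86::RCX
  unsigned Hi8 = getX86SubSuperRegister(X86::EAX, MVT::i8, true);  // X86::AH (High variant)
  unsigned Lo8 = getX86SubSuperRegister(X86::R9D, MVT::i8);        // X86::R9B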
+
+#include "X86GenRegisterInfo.inc"
+
+namespace {
+ struct MSAH : public MachineFunctionPass {
+ static char ID;
+ MSAH() : MachineFunctionPass(&ID) {}
+
+ virtual bool runOnMachineFunction(MachineFunction &MF) {
+ const X86TargetMachine *TM =
+ static_cast<const X86TargetMachine *>(&MF.getTarget());
+ const X86RegisterInfo *X86RI = TM->getRegisterInfo();
+ MachineRegisterInfo &RI = MF.getRegInfo();
+ X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+ unsigned StackAlignment = X86RI->getStackAlignment();
+
+      // Be over-conservative: scan over all vreg defs and find whether vector
+      // registers are used. If so, there is a possibility that a vector register
+      // will be spilled and thus require dynamic stack realignment.
+ for (unsigned RegNum = TargetRegisterInfo::FirstVirtualRegister;
+ RegNum < RI.getLastVirtReg(); ++RegNum)
+ if (RI.getRegClass(RegNum)->getAlignment() > StackAlignment) {
+ FuncInfo->setReserveFP(true);
+ return true;
+ }
+
+ // Nothing to do
+ return false;
+ }
+
+ virtual const char *getPassName() const {
+ return "X86 Maximal Stack Alignment Check";
+ }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+ };
+
+ char MSAH::ID = 0;
+}
+
+FunctionPass*
+llvm::createX86MaxStackAlignmentHeuristicPass() { return new MSAH(); }
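The factory function above is the only public entry point for the heuristic; a target would typically schedule it just before register allocation so ReserveFP is set early enough. A hedged sketch, assuming the conventional addPreRegAlloc hook of this LLVM vintage (the hook itself lives in X86TargetMachine.cpp and is not shown in this hunk):

  bool X86TargetMachine::addPreRegAlloc(PassManagerBase &PM,
                                        CodeGenOpt::Level OptLevel) {
    // Run the stack-alignment heuristic before the register allocator.
    PM.add(createX86MaxStackAlignmentHeuristicPass());
    return false;
  }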
diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.h b/contrib/llvm/lib/Target/X86/X86RegisterInfo.h
new file mode 100644
index 0000000..d0b82e2
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.h
@@ -0,0 +1,165 @@
+//===- X86RegisterInfo.h - X86 Register Information Impl --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86REGISTERINFO_H
+#define X86REGISTERINFO_H
+
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "X86GenRegisterInfo.h.inc"
+
+namespace llvm {
+ class Type;
+ class TargetInstrInfo;
+ class X86TargetMachine;
+
+/// N86 namespace - Native X86 register numbers
+///
+namespace N86 {
+ enum {
+ EAX = 0, ECX = 1, EDX = 2, EBX = 3, ESP = 4, EBP = 5, ESI = 6, EDI = 7
+ };
+}
+
+/// DWARFFlavour - Flavour of dwarf regnumbers
+///
+namespace DWARFFlavour {
+ enum {
+ X86_64 = 0, X86_32_DarwinEH = 1, X86_32_Generic = 2
+ };
+}
+
+class X86RegisterInfo : public X86GenRegisterInfo {
+public:
+ X86TargetMachine &TM;
+ const TargetInstrInfo &TII;
+
+private:
+ /// Is64Bit - Is the target 64-bits.
+ ///
+ bool Is64Bit;
+
+  /// IsWin64 - Is the target one of the Win64 flavours?
+ ///
+ bool IsWin64;
+
+ /// SlotSize - Stack slot size in bytes.
+ ///
+ unsigned SlotSize;
+
+ /// StackAlign - Default stack alignment.
+ ///
+ unsigned StackAlign;
+
+ /// StackPtr - X86 physical register used as stack ptr.
+ ///
+ unsigned StackPtr;
+
+ /// FramePtr - X86 physical register used as frame ptr.
+ ///
+ unsigned FramePtr;
+
+public:
+ X86RegisterInfo(X86TargetMachine &tm, const TargetInstrInfo &tii);
+
+ /// getX86RegNum - Returns the native X86 register number for the given LLVM
+ /// register identifier.
+ static unsigned getX86RegNum(unsigned RegNo);
+
+ unsigned getStackAlignment() const { return StackAlign; }
+
+ /// getDwarfRegNum - allows modification of X86GenRegisterInfo::getDwarfRegNum
+ /// (created by TableGen) for target dependencies.
+ int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+
+ /// Code Generation virtual methods...
+ ///
+
+ /// getMatchingSuperRegClass - Return a subclass of the specified register
+ /// class A so that each register in it has a sub-register of the
+ /// specified sub-register index which is in the specified register class B.
+ virtual const TargetRegisterClass *
+ getMatchingSuperRegClass(const TargetRegisterClass *A,
+ const TargetRegisterClass *B, unsigned Idx) const;
+
+ /// getPointerRegClass - Returns a TargetRegisterClass used for pointer
+ /// values.
+ const TargetRegisterClass *getPointerRegClass(unsigned Kind = 0) const;
+
+  /// getCrossCopyRegClass - Returns a legal register class to copy a register
+  /// in the specified class to or from. Returns NULL if it is possible to copy
+  /// directly between two registers of the specified class.
+ const TargetRegisterClass *
+ getCrossCopyRegClass(const TargetRegisterClass *RC) const;
+
+ /// getCalleeSavedRegs - Return a null-terminated list of all of the
+ /// callee-save registers on this target.
+ const unsigned *getCalleeSavedRegs(const MachineFunction* MF = 0) const;
+
+ /// getCalleeSavedRegClasses - Return a null-terminated list of the preferred
+ /// register classes to spill each callee-saved register with. The order and
+ /// length of this list match the getCalleeSavedRegs() list.
+ const TargetRegisterClass* const*
+ getCalleeSavedRegClasses(const MachineFunction *MF = 0) const;
+
+  /// getReservedRegs - Returns a bitset indexed by physical register number
+  /// indicating if a register is a special register that has particular uses
+  /// and should be considered unavailable at all times, e.g. SP, RA. This is
+  /// used by the register scavenger to determine which registers are free.
+ BitVector getReservedRegs(const MachineFunction &MF) const;
+
+ bool hasFP(const MachineFunction &MF) const;
+
+ bool canRealignStack(const MachineFunction &MF) const;
+
+ bool needsStackRealignment(const MachineFunction &MF) const;
+
+ bool hasReservedCallFrame(MachineFunction &MF) const;
+
+ bool hasReservedSpillSlot(MachineFunction &MF, unsigned Reg,
+ int &FrameIdx) const;
+
+ void eliminateCallFramePseudoInstr(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const;
+
+ unsigned eliminateFrameIndex(MachineBasicBlock::iterator MI,
+ int SPAdj, FrameIndexValue *Value = NULL,
+ RegScavenger *RS = NULL) const;
+
+ void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+ RegScavenger *RS = NULL) const;
+
+ void emitCalleeSavedFrameMoves(MachineFunction &MF, MCSymbol *Label,
+ unsigned FramePtr) const;
+ void emitPrologue(MachineFunction &MF) const;
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
+ // Debug information queries.
+ unsigned getRARegister() const;
+ unsigned getFrameRegister(const MachineFunction &MF) const;
+ int getFrameIndexOffset(const MachineFunction &MF, int FI) const;
+ void getInitialFrameState(std::vector<MachineMove> &Moves) const;
+
+ // Exception handling queries.
+ unsigned getEHExceptionRegister() const;
+ unsigned getEHHandlerRegister() const;
+};
+
+// getX86SubSuperRegister - X86 utility function. It returns the sub or super
+// register of a specific X86 register.
+// e.g. getX86SubSuperRegister(X86::EAX, MVT::i16) returns X86::AX
+unsigned getX86SubSuperRegister(unsigned, EVT, bool High=false);
+
+} // End llvm namespace
+
+#endif
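For orientation, getX86RegNum collapses every width of a general-purpose register onto the 3-bit native number from the N86 enum above; the REX extension bit for R8-R15 is handled separately by the encoder. An illustrative sketch (the values are assumed from the N86 numbering; the implementation is outside this hunk):

  unsigned A = X86RegisterInfo::getX86RegNum(X86::EAX);  // N86::EAX == 0
  unsigned D = X86RegisterInfo::getX86RegNum(X86::RDI);  // N86::EDI == 7
  unsigned R = X86RegisterInfo::getX86RegNum(X86::R8);   // low 3 bits == 0; REX.B marks the extension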
diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.td b/contrib/llvm/lib/Target/X86/X86RegisterInfo.td
new file mode 100644
index 0000000..91cfaa9
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.td
@@ -0,0 +1,816 @@
+//===- X86RegisterInfo.td - Describe the X86 Register File --*- tablegen -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 Register file, defining the registers themselves,
+// aliases between the registers, and the register classes built out of the
+// registers.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Register definitions...
+//
+let Namespace = "X86" in {
+
+ // Subregister indices.
+ def sub_8bit : SubRegIndex;
+ def sub_8bit_hi : SubRegIndex;
+ def sub_16bit : SubRegIndex;
+ def sub_32bit : SubRegIndex;
+
+ def sub_ss : SubRegIndex;
+ def sub_sd : SubRegIndex;
+ def sub_xmm : SubRegIndex;
+
+
+ // In the register alias definitions below, we define which registers alias
+ // which others. We only specify which registers the small registers alias,
+ // because the register file generator is smart enough to figure out that
+  // AL aliases AX if we tell it that AX aliases AL (for example).
+
+  // Dwarf numbering is different for 32-bit and 64-bit, and there are
+  // variations by target as well. Currently the first entry is for X86-64,
+  // the second is for EH on X86-32/Darwin, and the third is the 'generic'
+  // one (X86-32/Linux and debug information on X86-32/Darwin).
+
+ // 8-bit registers
+ // Low registers
+ def AL : Register<"al">, DwarfRegNum<[0, 0, 0]>;
+ def DL : Register<"dl">, DwarfRegNum<[1, 2, 2]>;
+ def CL : Register<"cl">, DwarfRegNum<[2, 1, 1]>;
+ def BL : Register<"bl">, DwarfRegNum<[3, 3, 3]>;
+
+ // X86-64 only
+ def SIL : Register<"sil">, DwarfRegNum<[4, 6, 6]>;
+ def DIL : Register<"dil">, DwarfRegNum<[5, 7, 7]>;
+ def BPL : Register<"bpl">, DwarfRegNum<[6, 4, 5]>;
+ def SPL : Register<"spl">, DwarfRegNum<[7, 5, 4]>;
+ def R8B : Register<"r8b">, DwarfRegNum<[8, -2, -2]>;
+ def R9B : Register<"r9b">, DwarfRegNum<[9, -2, -2]>;
+ def R10B : Register<"r10b">, DwarfRegNum<[10, -2, -2]>;
+ def R11B : Register<"r11b">, DwarfRegNum<[11, -2, -2]>;
+ def R12B : Register<"r12b">, DwarfRegNum<[12, -2, -2]>;
+ def R13B : Register<"r13b">, DwarfRegNum<[13, -2, -2]>;
+ def R14B : Register<"r14b">, DwarfRegNum<[14, -2, -2]>;
+ def R15B : Register<"r15b">, DwarfRegNum<[15, -2, -2]>;
+
+ // High registers. On x86-64, these cannot be used in any instruction
+ // with a REX prefix.
+ def AH : Register<"ah">, DwarfRegNum<[0, 0, 0]>;
+ def DH : Register<"dh">, DwarfRegNum<[1, 2, 2]>;
+ def CH : Register<"ch">, DwarfRegNum<[2, 1, 1]>;
+ def BH : Register<"bh">, DwarfRegNum<[3, 3, 3]>;
+
+ // 16-bit registers
+ let SubRegIndices = [sub_8bit, sub_8bit_hi] in {
+ def AX : RegisterWithSubRegs<"ax", [AL,AH]>, DwarfRegNum<[0, 0, 0]>;
+ def DX : RegisterWithSubRegs<"dx", [DL,DH]>, DwarfRegNum<[1, 2, 2]>;
+ def CX : RegisterWithSubRegs<"cx", [CL,CH]>, DwarfRegNum<[2, 1, 1]>;
+ def BX : RegisterWithSubRegs<"bx", [BL,BH]>, DwarfRegNum<[3, 3, 3]>;
+ }
+ let SubRegIndices = [sub_8bit] in {
+ def SI : RegisterWithSubRegs<"si", [SIL]>, DwarfRegNum<[4, 6, 6]>;
+ def DI : RegisterWithSubRegs<"di", [DIL]>, DwarfRegNum<[5, 7, 7]>;
+ def BP : RegisterWithSubRegs<"bp", [BPL]>, DwarfRegNum<[6, 4, 5]>;
+ def SP : RegisterWithSubRegs<"sp", [SPL]>, DwarfRegNum<[7, 5, 4]>;
+ }
+ def IP : Register<"ip">, DwarfRegNum<[16]>;
+
+ // X86-64 only
+ let SubRegIndices = [sub_8bit] in {
+ def R8W : RegisterWithSubRegs<"r8w", [R8B]>, DwarfRegNum<[8, -2, -2]>;
+ def R9W : RegisterWithSubRegs<"r9w", [R9B]>, DwarfRegNum<[9, -2, -2]>;
+ def R10W : RegisterWithSubRegs<"r10w", [R10B]>, DwarfRegNum<[10, -2, -2]>;
+ def R11W : RegisterWithSubRegs<"r11w", [R11B]>, DwarfRegNum<[11, -2, -2]>;
+ def R12W : RegisterWithSubRegs<"r12w", [R12B]>, DwarfRegNum<[12, -2, -2]>;
+ def R13W : RegisterWithSubRegs<"r13w", [R13B]>, DwarfRegNum<[13, -2, -2]>;
+ def R14W : RegisterWithSubRegs<"r14w", [R14B]>, DwarfRegNum<[14, -2, -2]>;
+ def R15W : RegisterWithSubRegs<"r15w", [R15B]>, DwarfRegNum<[15, -2, -2]>;
+ }
+ // 32-bit registers
+ let SubRegIndices = [sub_16bit] in {
+ def EAX : RegisterWithSubRegs<"eax", [AX]>, DwarfRegNum<[0, 0, 0]>;
+ def EDX : RegisterWithSubRegs<"edx", [DX]>, DwarfRegNum<[1, 2, 2]>;
+ def ECX : RegisterWithSubRegs<"ecx", [CX]>, DwarfRegNum<[2, 1, 1]>;
+ def EBX : RegisterWithSubRegs<"ebx", [BX]>, DwarfRegNum<[3, 3, 3]>;
+ def ESI : RegisterWithSubRegs<"esi", [SI]>, DwarfRegNum<[4, 6, 6]>;
+ def EDI : RegisterWithSubRegs<"edi", [DI]>, DwarfRegNum<[5, 7, 7]>;
+ def EBP : RegisterWithSubRegs<"ebp", [BP]>, DwarfRegNum<[6, 4, 5]>;
+ def ESP : RegisterWithSubRegs<"esp", [SP]>, DwarfRegNum<[7, 5, 4]>;
+ def EIP : RegisterWithSubRegs<"eip", [IP]>, DwarfRegNum<[16, 8, 8]>;
+
+ // X86-64 only
+ def R8D : RegisterWithSubRegs<"r8d", [R8W]>, DwarfRegNum<[8, -2, -2]>;
+ def R9D : RegisterWithSubRegs<"r9d", [R9W]>, DwarfRegNum<[9, -2, -2]>;
+ def R10D : RegisterWithSubRegs<"r10d", [R10W]>, DwarfRegNum<[10, -2, -2]>;
+ def R11D : RegisterWithSubRegs<"r11d", [R11W]>, DwarfRegNum<[11, -2, -2]>;
+ def R12D : RegisterWithSubRegs<"r12d", [R12W]>, DwarfRegNum<[12, -2, -2]>;
+ def R13D : RegisterWithSubRegs<"r13d", [R13W]>, DwarfRegNum<[13, -2, -2]>;
+ def R14D : RegisterWithSubRegs<"r14d", [R14W]>, DwarfRegNum<[14, -2, -2]>;
+ def R15D : RegisterWithSubRegs<"r15d", [R15W]>, DwarfRegNum<[15, -2, -2]>;
+ }
+
+ // 64-bit registers, X86-64 only
+ let SubRegIndices = [sub_32bit] in {
+ def RAX : RegisterWithSubRegs<"rax", [EAX]>, DwarfRegNum<[0, -2, -2]>;
+ def RDX : RegisterWithSubRegs<"rdx", [EDX]>, DwarfRegNum<[1, -2, -2]>;
+ def RCX : RegisterWithSubRegs<"rcx", [ECX]>, DwarfRegNum<[2, -2, -2]>;
+ def RBX : RegisterWithSubRegs<"rbx", [EBX]>, DwarfRegNum<[3, -2, -2]>;
+ def RSI : RegisterWithSubRegs<"rsi", [ESI]>, DwarfRegNum<[4, -2, -2]>;
+ def RDI : RegisterWithSubRegs<"rdi", [EDI]>, DwarfRegNum<[5, -2, -2]>;
+ def RBP : RegisterWithSubRegs<"rbp", [EBP]>, DwarfRegNum<[6, -2, -2]>;
+ def RSP : RegisterWithSubRegs<"rsp", [ESP]>, DwarfRegNum<[7, -2, -2]>;
+
+ def R8 : RegisterWithSubRegs<"r8", [R8D]>, DwarfRegNum<[8, -2, -2]>;
+ def R9 : RegisterWithSubRegs<"r9", [R9D]>, DwarfRegNum<[9, -2, -2]>;
+ def R10 : RegisterWithSubRegs<"r10", [R10D]>, DwarfRegNum<[10, -2, -2]>;
+ def R11 : RegisterWithSubRegs<"r11", [R11D]>, DwarfRegNum<[11, -2, -2]>;
+ def R12 : RegisterWithSubRegs<"r12", [R12D]>, DwarfRegNum<[12, -2, -2]>;
+ def R13 : RegisterWithSubRegs<"r13", [R13D]>, DwarfRegNum<[13, -2, -2]>;
+ def R14 : RegisterWithSubRegs<"r14", [R14D]>, DwarfRegNum<[14, -2, -2]>;
+ def R15 : RegisterWithSubRegs<"r15", [R15D]>, DwarfRegNum<[15, -2, -2]>;
+ def RIP : RegisterWithSubRegs<"rip", [EIP]>, DwarfRegNum<[16, -2, -2]>;
+ }
+
+ // MMX Registers. These are actually aliased to ST0 .. ST7
+ def MM0 : Register<"mm0">, DwarfRegNum<[41, 29, 29]>;
+ def MM1 : Register<"mm1">, DwarfRegNum<[42, 30, 30]>;
+ def MM2 : Register<"mm2">, DwarfRegNum<[43, 31, 31]>;
+ def MM3 : Register<"mm3">, DwarfRegNum<[44, 32, 32]>;
+ def MM4 : Register<"mm4">, DwarfRegNum<[45, 33, 33]>;
+ def MM5 : Register<"mm5">, DwarfRegNum<[46, 34, 34]>;
+ def MM6 : Register<"mm6">, DwarfRegNum<[47, 35, 35]>;
+ def MM7 : Register<"mm7">, DwarfRegNum<[48, 36, 36]>;
+
+ // Pseudo Floating Point registers
+ def FP0 : Register<"fp0">;
+ def FP1 : Register<"fp1">;
+ def FP2 : Register<"fp2">;
+ def FP3 : Register<"fp3">;
+ def FP4 : Register<"fp4">;
+ def FP5 : Register<"fp5">;
+ def FP6 : Register<"fp6">;
+
+ // XMM Registers, used by the various SSE instruction set extensions.
+ // The sub_ss and sub_sd subregs are the same registers with another regclass.
+ let CompositeIndices = [(sub_ss), (sub_sd)] in {
+ def XMM0: Register<"xmm0">, DwarfRegNum<[17, 21, 21]>;
+ def XMM1: Register<"xmm1">, DwarfRegNum<[18, 22, 22]>;
+ def XMM2: Register<"xmm2">, DwarfRegNum<[19, 23, 23]>;
+ def XMM3: Register<"xmm3">, DwarfRegNum<[20, 24, 24]>;
+ def XMM4: Register<"xmm4">, DwarfRegNum<[21, 25, 25]>;
+ def XMM5: Register<"xmm5">, DwarfRegNum<[22, 26, 26]>;
+ def XMM6: Register<"xmm6">, DwarfRegNum<[23, 27, 27]>;
+ def XMM7: Register<"xmm7">, DwarfRegNum<[24, 28, 28]>;
+
+ // X86-64 only
+ def XMM8: Register<"xmm8">, DwarfRegNum<[25, -2, -2]>;
+ def XMM9: Register<"xmm9">, DwarfRegNum<[26, -2, -2]>;
+ def XMM10: Register<"xmm10">, DwarfRegNum<[27, -2, -2]>;
+ def XMM11: Register<"xmm11">, DwarfRegNum<[28, -2, -2]>;
+ def XMM12: Register<"xmm12">, DwarfRegNum<[29, -2, -2]>;
+ def XMM13: Register<"xmm13">, DwarfRegNum<[30, -2, -2]>;
+ def XMM14: Register<"xmm14">, DwarfRegNum<[31, -2, -2]>;
+ def XMM15: Register<"xmm15">, DwarfRegNum<[32, -2, -2]>;
+ }
+
+ // YMM Registers, used by AVX instructions
+ let SubRegIndices = [sub_xmm] in {
+ def YMM0: RegisterWithSubRegs<"ymm0", [XMM0]>, DwarfRegNum<[17, 21, 21]>;
+ def YMM1: RegisterWithSubRegs<"ymm1", [XMM1]>, DwarfRegNum<[18, 22, 22]>;
+ def YMM2: RegisterWithSubRegs<"ymm2", [XMM2]>, DwarfRegNum<[19, 23, 23]>;
+ def YMM3: RegisterWithSubRegs<"ymm3", [XMM3]>, DwarfRegNum<[20, 24, 24]>;
+ def YMM4: RegisterWithSubRegs<"ymm4", [XMM4]>, DwarfRegNum<[21, 25, 25]>;
+ def YMM5: RegisterWithSubRegs<"ymm5", [XMM5]>, DwarfRegNum<[22, 26, 26]>;
+ def YMM6: RegisterWithSubRegs<"ymm6", [XMM6]>, DwarfRegNum<[23, 27, 27]>;
+ def YMM7: RegisterWithSubRegs<"ymm7", [XMM7]>, DwarfRegNum<[24, 28, 28]>;
+ def YMM8: RegisterWithSubRegs<"ymm8", [XMM8]>, DwarfRegNum<[25, -2, -2]>;
+ def YMM9: RegisterWithSubRegs<"ymm9", [XMM9]>, DwarfRegNum<[26, -2, -2]>;
+ def YMM10: RegisterWithSubRegs<"ymm10", [XMM10]>, DwarfRegNum<[27, -2, -2]>;
+ def YMM11: RegisterWithSubRegs<"ymm11", [XMM11]>, DwarfRegNum<[28, -2, -2]>;
+ def YMM12: RegisterWithSubRegs<"ymm12", [XMM12]>, DwarfRegNum<[29, -2, -2]>;
+ def YMM13: RegisterWithSubRegs<"ymm13", [XMM13]>, DwarfRegNum<[30, -2, -2]>;
+ def YMM14: RegisterWithSubRegs<"ymm14", [XMM14]>, DwarfRegNum<[31, -2, -2]>;
+ def YMM15: RegisterWithSubRegs<"ymm15", [XMM15]>, DwarfRegNum<[32, -2, -2]>;
+ }
+
+ // Floating point stack registers
+ def ST0 : Register<"st(0)">, DwarfRegNum<[33, 12, 11]>;
+ def ST1 : Register<"st(1)">, DwarfRegNum<[34, 13, 12]>;
+ def ST2 : Register<"st(2)">, DwarfRegNum<[35, 14, 13]>;
+ def ST3 : Register<"st(3)">, DwarfRegNum<[36, 15, 14]>;
+ def ST4 : Register<"st(4)">, DwarfRegNum<[37, 16, 15]>;
+ def ST5 : Register<"st(5)">, DwarfRegNum<[38, 17, 16]>;
+ def ST6 : Register<"st(6)">, DwarfRegNum<[39, 18, 17]>;
+ def ST7 : Register<"st(7)">, DwarfRegNum<[40, 19, 18]>;
+
+ // Status flags register
+ def EFLAGS : Register<"flags">;
+
+ // Segment registers
+ def CS : Register<"cs">;
+ def DS : Register<"ds">;
+ def SS : Register<"ss">;
+ def ES : Register<"es">;
+ def FS : Register<"fs">;
+ def GS : Register<"gs">;
+
+ // Debug registers
+ def DR0 : Register<"dr0">;
+ def DR1 : Register<"dr1">;
+ def DR2 : Register<"dr2">;
+ def DR3 : Register<"dr3">;
+ def DR4 : Register<"dr4">;
+ def DR5 : Register<"dr5">;
+ def DR6 : Register<"dr6">;
+ def DR7 : Register<"dr7">;
+
+ // Condition registers
+ def CR0 : Register<"cr0">;
+ def CR1 : Register<"cr1">;
+ def CR2 : Register<"cr2">;
+ def CR3 : Register<"cr3">;
+ def CR4 : Register<"cr4">;
+ def CR5 : Register<"cr5">;
+ def CR6 : Register<"cr6">;
+ def CR7 : Register<"cr7">;
+ def CR8 : Register<"cr8">;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Register Class Definitions... now that we have all of the pieces, define the
+// top-level register classes. The order specified in the register list is
+// implicitly defined to be the register allocation order.
+//
+
+// List call-clobbered registers before callee-save registers. RBX, RBP, (and
+// R12, R13, R14, and R15 for X86-64) are callee-save registers.
+// In 64-bit mode, there are 12 additional i8 registers, SIL, DIL, BPL, SPL, and
+// R8B, ... R15B.
+// Allocate R12 and R13 last, as these require an extra byte when
+// encoded in x86_64 instructions.
+// FIXME: Allow AH, CH, DH, BH to be used as general-purpose registers in
+// 64-bit mode. The main complication is that they cannot be encoded in an
+// instruction requiring a REX prefix, while SIL, DIL, BPL, R8D, etc.
+// require a REX prefix. For example, "addb %ah, %dil" and "movzbl %ah, %r8d"
+// cannot be encoded.
+def GR8 : RegisterClass<"X86", [i8], 8,
+ [AL, CL, DL, AH, CH, DH, BL, BH, SIL, DIL, BPL, SPL,
+ R8B, R9B, R10B, R11B, R14B, R15B, R12B, R13B]> {
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ static const unsigned X86_GR8_AO_64[] = {
+ X86::AL, X86::CL, X86::DL, X86::SIL, X86::DIL,
+ X86::R8B, X86::R9B, X86::R10B, X86::R11B,
+ X86::BL, X86::R14B, X86::R15B, X86::R12B, X86::R13B, X86::BPL
+ };
+
+ GR8Class::iterator
+ GR8Class::allocation_order_begin(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ if (Subtarget.is64Bit())
+ return X86_GR8_AO_64;
+ else
+ return begin();
+ }
+
+ GR8Class::iterator
+ GR8Class::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>();
+ // Does the function dedicate RBP / EBP to being a frame ptr?
+ if (!Subtarget.is64Bit())
+        // In 32-bit mode, none of the 8-bit registers aliases EBP or ESP.
+ return begin() + 8;
+ else if (RI->hasFP(MF) || MFI->getReserveFP())
+ // If so, don't allocate SPL or BPL.
+ return array_endof(X86_GR8_AO_64) - 1;
+ else
+ // If not, just don't allocate SPL.
+ return array_endof(X86_GR8_AO_64);
+ }
+ }];
+}
+
+def GR16 : RegisterClass<"X86", [i16], 16,
+ [AX, CX, DX, SI, DI, BX, BP, SP,
+ R8W, R9W, R10W, R11W, R14W, R15W, R12W, R13W]> {
+ let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi)];
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ static const unsigned X86_GR16_AO_64[] = {
+ X86::AX, X86::CX, X86::DX, X86::SI, X86::DI,
+ X86::R8W, X86::R9W, X86::R10W, X86::R11W,
+ X86::BX, X86::R14W, X86::R15W, X86::R12W, X86::R13W, X86::BP
+ };
+
+ GR16Class::iterator
+ GR16Class::allocation_order_begin(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ if (Subtarget.is64Bit())
+ return X86_GR16_AO_64;
+ else
+ return begin();
+ }
+
+ GR16Class::iterator
+ GR16Class::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>();
+ if (Subtarget.is64Bit()) {
+ // Does the function dedicate RBP to being a frame ptr?
+ if (RI->hasFP(MF) || MFI->getReserveFP())
+ // If so, don't allocate SP or BP.
+ return array_endof(X86_GR16_AO_64) - 1;
+ else
+ // If not, just don't allocate SP.
+ return array_endof(X86_GR16_AO_64);
+ } else {
+ // Does the function dedicate EBP to being a frame ptr?
+ if (RI->hasFP(MF) || MFI->getReserveFP())
+ // If so, don't allocate SP or BP.
+ return begin() + 6;
+ else
+ // If not, just don't allocate SP.
+ return begin() + 7;
+ }
+ }
+ }];
+}
+
+def GR32 : RegisterClass<"X86", [i32], 32,
+ [EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP,
+ R8D, R9D, R10D, R11D, R14D, R15D, R12D, R13D]> {
+ let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit)];
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ static const unsigned X86_GR32_AO_64[] = {
+ X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI,
+ X86::R8D, X86::R9D, X86::R10D, X86::R11D,
+ X86::EBX, X86::R14D, X86::R15D, X86::R12D, X86::R13D, X86::EBP
+ };
+
+ GR32Class::iterator
+ GR32Class::allocation_order_begin(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ if (Subtarget.is64Bit())
+ return X86_GR32_AO_64;
+ else
+ return begin();
+ }
+
+ GR32Class::iterator
+ GR32Class::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>();
+ if (Subtarget.is64Bit()) {
+ // Does the function dedicate RBP to being a frame ptr?
+ if (RI->hasFP(MF) || MFI->getReserveFP())
+ // If so, don't allocate ESP or EBP.
+ return array_endof(X86_GR32_AO_64) - 1;
+ else
+ // If not, just don't allocate ESP.
+ return array_endof(X86_GR32_AO_64);
+ } else {
+ // Does the function dedicate EBP to being a frame ptr?
+ if (RI->hasFP(MF) || MFI->getReserveFP())
+ // If so, don't allocate ESP or EBP.
+ return begin() + 6;
+ else
+ // If not, just don't allocate ESP.
+ return begin() + 7;
+ }
+ }
+ }];
+}
+
+// GR64 - 64-bit GPRs. This oddly includes RIP, which isn't accurate, since
+// RIP isn't really a register and it can't be used anywhere except in an
+// address, but it doesn't cause trouble.
+def GR64 : RegisterClass<"X86", [i64], 64,
+ [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
+ RBX, R14, R15, R12, R13, RBP, RSP, RIP]> {
+ let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi),
+ (GR16 sub_16bit),
+ (GR32 sub_32bit)];
+ let MethodProtos = [{
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ GR64Class::iterator
+ GR64Class::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>();
+ if (!Subtarget.is64Bit())
+ return begin(); // None of these are allocatable in 32-bit.
+ // Does the function dedicate RBP to being a frame ptr?
+ if (RI->hasFP(MF) || MFI->getReserveFP())
+ return end()-3; // If so, don't allocate RIP, RSP or RBP
+ else
+ return end()-2; // If not, just don't allocate RIP or RSP
+ }
+ }];
+}
+
+// Segment registers for use by MOV instructions (and others) that have a
+// segment register as one operand. Always contain a 16-bit segment
+// descriptor.
+def SEGMENT_REG : RegisterClass<"X86", [i16], 16, [CS, DS, SS, ES, FS, GS]> {
+}
+
+// Debug registers.
+def DEBUG_REG : RegisterClass<"X86", [i32], 32,
+ [DR0, DR1, DR2, DR3, DR4, DR5, DR6, DR7]> {
+}
+
+// Control registers.
+def CONTROL_REG : RegisterClass<"X86", [i64], 64,
+ [CR0, CR1, CR2, CR3, CR4, CR5, CR6, CR7, CR8]> {
+}
+
+// GR8_ABCD_L, GR8_ABCD_H, GR16_ABCD, GR32_ABCD, GR64_ABCD - Subclasses of
+// GR8, GR16, GR32, and GR64 which contain just the "a", "b", "c", and "d"
+// registers. On x86-32, GR16_ABCD and GR32_ABCD are classes for registers
+// that support 8-bit subreg operations. On x86-64, GR16_ABCD, GR32_ABCD,
+// and GR64_ABCD are classes for registers that support 8-bit h-register
+// operations.
+def GR8_ABCD_L : RegisterClass<"X86", [i8], 8, [AL, CL, DL, BL]> {
+}
+def GR8_ABCD_H : RegisterClass<"X86", [i8], 8, [AH, CH, DH, BH]> {
+}
+def GR16_ABCD : RegisterClass<"X86", [i16], 16, [AX, CX, DX, BX]> {
+ let SubRegClasses = [(GR8_ABCD_L sub_8bit), (GR8_ABCD_H sub_8bit_hi)];
+}
+def GR32_ABCD : RegisterClass<"X86", [i32], 32, [EAX, ECX, EDX, EBX]> {
+ let SubRegClasses = [(GR8_ABCD_L sub_8bit),
+ (GR8_ABCD_H sub_8bit_hi),
+ (GR16_ABCD sub_16bit)];
+}
+def GR64_ABCD : RegisterClass<"X86", [i64], 64, [RAX, RCX, RDX, RBX]> {
+ let SubRegClasses = [(GR8_ABCD_L sub_8bit),
+ (GR8_ABCD_H sub_8bit_hi),
+ (GR16_ABCD sub_16bit),
+ (GR32_ABCD sub_32bit)];
+}
+def GR32_TC : RegisterClass<"X86", [i32], 32, [EAX, ECX, EDX]> {
+ let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit)];
+}
+def GR64_TC : RegisterClass<"X86", [i64], 64, [RAX, RCX, RDX, RSI, RDI,
+ R8, R9, R11]> {
+ let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi),
+ (GR16 sub_16bit),
+ (GR32_TC sub_32bit)];
+}
+
+// GR8_NOREX - GR8 registers which do not require a REX prefix.
+def GR8_NOREX : RegisterClass<"X86", [i8], 8,
+ [AL, CL, DL, AH, CH, DH, BL, BH]> {
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ // In 64-bit mode, it's not safe to blindly allocate H registers.
+ static const unsigned X86_GR8_NOREX_AO_64[] = {
+ X86::AL, X86::CL, X86::DL, X86::BL
+ };
+
+ GR8_NOREXClass::iterator
+ GR8_NOREXClass::allocation_order_begin(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ if (Subtarget.is64Bit())
+ return X86_GR8_NOREX_AO_64;
+ else
+ return begin();
+ }
+
+ GR8_NOREXClass::iterator
+ GR8_NOREXClass::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ if (Subtarget.is64Bit())
+ return array_endof(X86_GR8_NOREX_AO_64);
+ else
+ return end();
+ }
+ }];
+}
+// GR16_NOREX - GR16 registers which do not require a REX prefix.
+def GR16_NOREX : RegisterClass<"X86", [i16], 16,
+ [AX, CX, DX, SI, DI, BX, BP, SP]> {
+ let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi)];
+ let MethodProtos = [{
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ GR16_NOREXClass::iterator
+ GR16_NOREXClass::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>();
+ // Does the function dedicate RBP / EBP to being a frame ptr?
+ if (RI->hasFP(MF) || MFI->getReserveFP())
+ // If so, don't allocate SP or BP.
+ return end() - 2;
+ else
+ // If not, just don't allocate SP.
+ return end() - 1;
+ }
+ }];
+}
+// GR32_NOREX - GR32 registers which do not require a REX prefix.
+def GR32_NOREX : RegisterClass<"X86", [i32], 32,
+ [EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP]> {
+ let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi),
+ (GR16_NOREX sub_16bit)];
+ let MethodProtos = [{
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ GR32_NOREXClass::iterator
+ GR32_NOREXClass::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>();
+ // Does the function dedicate RBP / EBP to being a frame ptr?
+ if (RI->hasFP(MF) || MFI->getReserveFP())
+ // If so, don't allocate ESP or EBP.
+ return end() - 2;
+ else
+ // If not, just don't allocate ESP.
+ return end() - 1;
+ }
+ }];
+}
+// GR64_NOREX - GR64 registers which do not require a REX prefix.
+def GR64_NOREX : RegisterClass<"X86", [i64], 64,
+ [RAX, RCX, RDX, RSI, RDI, RBX, RBP, RSP, RIP]> {
+ let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi),
+ (GR16_NOREX sub_16bit),
+ (GR32_NOREX sub_32bit)];
+ let MethodProtos = [{
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ GR64_NOREXClass::iterator
+ GR64_NOREXClass::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>();
+ // Does the function dedicate RBP to being a frame ptr?
+ if (RI->hasFP(MF) || MFI->getReserveFP())
+ // If so, don't allocate RIP, RSP or RBP.
+ return end() - 3;
+ else
+ // If not, just don't allocate RIP or RSP.
+ return end() - 2;
+ }
+ }];
+}
+
+// GR32_NOSP - GR32 registers except ESP.
+def GR32_NOSP : RegisterClass<"X86", [i32], 32,
+ [EAX, ECX, EDX, ESI, EDI, EBX, EBP,
+ R8D, R9D, R10D, R11D, R14D, R15D, R12D, R13D]> {
+ let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit)];
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ static const unsigned X86_GR32_NOSP_AO_64[] = {
+ X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI,
+ X86::R8D, X86::R9D, X86::R10D, X86::R11D,
+ X86::EBX, X86::R14D, X86::R15D, X86::R12D, X86::R13D, X86::EBP
+ };
+
+ GR32_NOSPClass::iterator
+ GR32_NOSPClass::allocation_order_begin(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ if (Subtarget.is64Bit())
+ return X86_GR32_NOSP_AO_64;
+ else
+ return begin();
+ }
+
+ GR32_NOSPClass::iterator
+ GR32_NOSPClass::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>();
+ if (Subtarget.is64Bit()) {
+ // Does the function dedicate RBP to being a frame ptr?
+ if (RI->hasFP(MF) || MFI->getReserveFP())
+ // If so, don't allocate EBP.
+ return array_endof(X86_GR32_NOSP_AO_64) - 1;
+ else
+ // If not, any reg in this class is ok.
+ return array_endof(X86_GR32_NOSP_AO_64);
+ } else {
+ // Does the function dedicate EBP to being a frame ptr?
+ if (RI->hasFP(MF) || MFI->getReserveFP())
+ // If so, don't allocate EBP.
+ return begin() + 6;
+ else
+ // If not, any reg in this class is ok.
+ return begin() + 7;
+ }
+ }
+ }];
+}
+
+// GR64_NOSP - GR64 registers except RSP (and RIP).
+def GR64_NOSP : RegisterClass<"X86", [i64], 64,
+ [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
+ RBX, R14, R15, R12, R13, RBP]> {
+ let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi),
+ (GR16 sub_16bit),
+ (GR32_NOSP sub_32bit)];
+ let MethodProtos = [{
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ GR64_NOSPClass::iterator
+ GR64_NOSPClass::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>();
+ if (!Subtarget.is64Bit())
+ return begin(); // None of these are allocatable in 32-bit.
+ // Does the function dedicate RBP to being a frame ptr?
+ if (RI->hasFP(MF) || MFI->getReserveFP())
+ return end()-1; // If so, don't allocate RBP
+ else
+ return end(); // If not, any reg in this class is ok.
+ }
+ }];
+}
+
+// GR64_NOREX_NOSP - GR64_NOREX registers except RSP.
+def GR64_NOREX_NOSP : RegisterClass<"X86", [i64], 64,
+ [RAX, RCX, RDX, RSI, RDI, RBX, RBP]> {
+ let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi),
+ (GR16_NOREX sub_16bit),
+ (GR32_NOREX sub_32bit)];
+ let MethodProtos = [{
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ GR64_NOREX_NOSPClass::iterator
+ GR64_NOREX_NOSPClass::allocation_order_end(const MachineFunction &MF) const
+ {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetRegisterInfo *RI = TM.getRegisterInfo();
+ const X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>();
+ // Does the function dedicate RBP to being a frame ptr?
+ if (RI->hasFP(MF) || MFI->getReserveFP())
+ // If so, don't allocate RBP.
+ return end() - 1;
+ else
+ // If not, any reg in this class is ok.
+ return end();
+ }
+ }];
+}
+
+// A class to support the 'A' assembler constraint: EAX then EDX.
+def GR32_AD : RegisterClass<"X86", [i32], 32, [EAX, EDX]> {
+ let SubRegClasses = [(GR8_ABCD_L sub_8bit),
+ (GR8_ABCD_H sub_8bit_hi),
+ (GR16_ABCD sub_16bit)];
+}
+
+// Scalar SSE2 floating point registers.
+def FR32 : RegisterClass<"X86", [f32], 32,
+ [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+ XMM8, XMM9, XMM10, XMM11,
+ XMM12, XMM13, XMM14, XMM15]> {
+ let MethodProtos = [{
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ FR32Class::iterator
+ FR32Class::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ if (!Subtarget.is64Bit())
+ return end()-8; // Only XMM0 to XMM7 are available in 32-bit mode.
+ else
+ return end();
+ }
+ }];
+}
+
+def FR64 : RegisterClass<"X86", [f64], 64,
+ [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+ XMM8, XMM9, XMM10, XMM11,
+ XMM12, XMM13, XMM14, XMM15]> {
+ let MethodProtos = [{
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ FR64Class::iterator
+ FR64Class::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ if (!Subtarget.is64Bit())
+ return end()-8; // Only XMM0 to XMM7 are available in 32-bit mode.
+ else
+ return end();
+ }
+ }];
+}
+
+
+// FIXME: This sets up the floating point register files as though they are f64
+// values, though they really are f80 values. This will cause us to spill
+// values as 64-bit quantities instead of 80-bit quantities, which is much much
+// faster on common hardware. In reality, this should be controlled by a
+// command line option or something.
+
+def RFP32 : RegisterClass<"X86",[f32], 32, [FP0, FP1, FP2, FP3, FP4, FP5, FP6]>;
+def RFP64 : RegisterClass<"X86",[f64], 32, [FP0, FP1, FP2, FP3, FP4, FP5, FP6]>;
+def RFP80 : RegisterClass<"X86",[f80], 32, [FP0, FP1, FP2, FP3, FP4, FP5, FP6]>;
+
+// Floating point stack registers (these are not allocatable by the
+// register allocator - the floating point stackifier is responsible
+// for transforming FPn allocations to STn registers)
+def RST : RegisterClass<"X86", [f80, f64, f32], 32,
+ [ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7]> {
+ let MethodProtos = [{
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ RSTClass::iterator
+ RSTClass::allocation_order_end(const MachineFunction &MF) const {
+ return begin();
+ }
+ }];
+}
+
+// Generic vector registers: VR64 and VR128.
+def VR64 : RegisterClass<"X86", [v8i8, v4i16, v2i32, v1i64, v2f32], 64,
+ [MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7]>;
+def VR128 : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],128,
+ [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+ XMM8, XMM9, XMM10, XMM11,
+ XMM12, XMM13, XMM14, XMM15]> {
+ let SubRegClasses = [(FR32 sub_ss), (FR64 sub_sd)];
+
+ let MethodProtos = [{
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ VR128Class::iterator
+ VR128Class::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ if (!Subtarget.is64Bit())
+ return end()-8; // Only XMM0 to XMM7 are available in 32-bit mode.
+ else
+ return end();
+ }
+ }];
+}
+def VR256 : RegisterClass<"X86", [ v8i32, v4i64, v8f32, v4f64],256,
+ [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
+ YMM8, YMM9, YMM10, YMM11,
+ YMM12, YMM13, YMM14, YMM15]> {
+ let SubRegClasses = [(FR32 sub_ss), (FR64 sub_sd), (VR128 sub_xmm)];
+}
+
+// Status flags registers.
+def CCR : RegisterClass<"X86", [i32], 32, [EFLAGS]> {
+ let CopyCost = -1; // Don't allow copying of status registers.
+}
diff --git a/contrib/llvm/lib/Target/X86/X86Relocations.h b/contrib/llvm/lib/Target/X86/X86Relocations.h
new file mode 100644
index 0000000..990962d
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86Relocations.h
@@ -0,0 +1,52 @@
+//===- X86Relocations.h - X86 Code Relocations ------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the X86 target-specific relocation types.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86RELOCATIONS_H
+#define X86RELOCATIONS_H
+
+#include "llvm/CodeGen/MachineRelocation.h"
+
+namespace llvm {
+ namespace X86 {
+    /// RelocationType - An enum for the x86 relocation codes. Note that
+    /// the terminology here doesn't follow x86 convention - word means
+    /// 32-bit and dword means 64-bit. The relocations are handled by the
+    /// JIT or object-code emitters; this is transparent to the x86 code
+    /// emitter, but the JIT and object-code emitters will treat them
+    /// differently.
+ enum RelocationType {
+ /// reloc_pcrel_word - PC relative relocation, add the relocated value to
+ /// the value already in memory, after we adjust it for where the PC is.
+ reloc_pcrel_word = 0,
+
+ /// reloc_picrel_word - PIC base relative relocation, add the relocated
+ /// value to the value already in memory, after we adjust it for where the
+ /// PIC base is.
+ reloc_picrel_word = 1,
+
+ /// reloc_absolute_word - absolute relocation, just add the relocated
+ /// value to the value already in memory.
+ reloc_absolute_word = 2,
+
+ /// reloc_absolute_word_sext - absolute relocation, just add the relocated
+ /// value to the value already in memory. In object files, it represents a
+ /// value which must be sign-extended when resolving the relocation.
+ reloc_absolute_word_sext = 3,
+
+ /// reloc_absolute_dword - absolute relocation, just add the relocated
+ /// value to the value already in memory.
+ reloc_absolute_dword = 4
+ };
+ }
+}
+
+#endif
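Since "word" and "dword" are redefined above, a consumer only needs the enum to know how many bytes a fixup occupies. A hedged sketch (the helper name is invented; only the enumerators come from this header):

  static unsigned relocSizeInBytes(llvm::X86::RelocationType RT) {
    switch (RT) {
    case llvm::X86::reloc_pcrel_word:
    case llvm::X86::reloc_picrel_word:
    case llvm::X86::reloc_absolute_word:
    case llvm::X86::reloc_absolute_word_sext:
      return 4;   // "word" means 32 bits here
    case llvm::X86::reloc_absolute_dword:
      return 8;   // "dword" means 64 bits here
    }
    return 0;
  }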
diff --git a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
new file mode 100644
index 0000000..6297a27
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -0,0 +1,243 @@
+//===-- X86SelectionDAGInfo.cpp - X86 SelectionDAG Info -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the X86SelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-selectiondag-info"
+#include "X86TargetMachine.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+using namespace llvm;
+
+X86SelectionDAGInfo::X86SelectionDAGInfo(const X86TargetMachine &TM) :
+ TargetSelectionDAGInfo(TM),
+ Subtarget(&TM.getSubtarget<X86Subtarget>()),
+ TLI(*TM.getTargetLowering()) {
+}
+
+X86SelectionDAGInfo::~X86SelectionDAGInfo() {
+}
+
+SDValue
+X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
+ SDValue Chain,
+ SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align,
+ bool isVolatile,
+ const Value *DstSV,
+ uint64_t DstSVOff) const {
+ ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+
+ // If not DWORD aligned or size is more than the threshold, call the library.
+ // The libc version is likely to be faster for these cases. It can use the
+  // address value and run-time information about the CPU.
+ if ((Align & 3) != 0 ||
+ !ConstantSize ||
+ ConstantSize->getZExtValue() >
+ Subtarget->getMaxInlineSizeThreshold()) {
+ SDValue InFlag(0, 0);
+
+ // Check to see if there is a specialized entry-point for memory zeroing.
+ ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src);
+
+ if (const char *bzeroEntry = V &&
+ V->isNullValue() ? Subtarget->getBZeroEntry() : 0) {
+ EVT IntPtr = TLI.getPointerTy();
+ const Type *IntPtrTy = getTargetData()->getIntPtrType(*DAG.getContext());
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ Entry.Node = Dst;
+ Entry.Ty = IntPtrTy;
+ Args.push_back(Entry);
+ Entry.Node = Size;
+ Args.push_back(Entry);
+ std::pair<SDValue,SDValue> CallResult =
+ TLI.LowerCallTo(Chain, Type::getVoidTy(*DAG.getContext()),
+ false, false, false, false,
+ 0, CallingConv::C, false, /*isReturnValueUsed=*/false,
+ DAG.getExternalSymbol(bzeroEntry, IntPtr), Args,
+ DAG, dl);
+ return CallResult.second;
+ }
+
+ // Otherwise have the target-independent code call memset.
+ return SDValue();
+ }
+
+ uint64_t SizeVal = ConstantSize->getZExtValue();
+ SDValue InFlag(0, 0);
+ EVT AVT;
+ SDValue Count;
+ ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Src);
+ unsigned BytesLeft = 0;
+ bool TwoRepStos = false;
+ if (ValC) {
+ unsigned ValReg;
+ uint64_t Val = ValC->getZExtValue() & 255;
+
+ // If the value is a constant, then we can potentially use larger sets.
+ switch (Align & 3) {
+ case 2: // WORD aligned
+ AVT = MVT::i16;
+ ValReg = X86::AX;
+ Val = (Val << 8) | Val;
+ break;
+ case 0: // DWORD aligned
+ AVT = MVT::i32;
+ ValReg = X86::EAX;
+ Val = (Val << 8) | Val;
+ Val = (Val << 16) | Val;
+ if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) { // QWORD aligned
+ AVT = MVT::i64;
+ ValReg = X86::RAX;
+ Val = (Val << 32) | Val;
+ }
+ break;
+ default: // Byte aligned
+ AVT = MVT::i8;
+ ValReg = X86::AL;
+ Count = DAG.getIntPtrConstant(SizeVal);
+ break;
+ }
+
+ if (AVT.bitsGT(MVT::i8)) {
+ unsigned UBytes = AVT.getSizeInBits() / 8;
+ Count = DAG.getIntPtrConstant(SizeVal / UBytes);
+ BytesLeft = SizeVal % UBytes;
+ }
+
+ Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, AVT),
+ InFlag);
+ InFlag = Chain.getValue(1);
+ } else {
+ AVT = MVT::i8;
+ Count = DAG.getIntPtrConstant(SizeVal);
+ Chain = DAG.getCopyToReg(Chain, dl, X86::AL, Src, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX :
+ X86::ECX,
+ Count, InFlag);
+ InFlag = Chain.getValue(1);
+ Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI :
+ X86::EDI,
+ Dst, InFlag);
+ InFlag = Chain.getValue(1);
+
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+ SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag };
+ Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops, array_lengthof(Ops));
+
+ if (TwoRepStos) {
+ InFlag = Chain.getValue(1);
+ Count = Size;
+ EVT CVT = Count.getValueType();
+ SDValue Left = DAG.getNode(ISD::AND, dl, CVT, Count,
+ DAG.getConstant((AVT == MVT::i64) ? 7 : 3, CVT));
+ Chain = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? X86::RCX :
+ X86::ECX,
+ Left, InFlag);
+ InFlag = Chain.getValue(1);
+ Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+ SDValue Ops[] = { Chain, DAG.getValueType(MVT::i8), InFlag };
+ Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops, array_lengthof(Ops));
+ } else if (BytesLeft) {
+ // Handle the last 1 - 7 bytes.
+ unsigned Offset = SizeVal - BytesLeft;
+ EVT AddrVT = Dst.getValueType();
+ EVT SizeVT = Size.getValueType();
+
+ Chain = DAG.getMemset(Chain, dl,
+ DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
+ DAG.getConstant(Offset, AddrVT)),
+ Src,
+ DAG.getConstant(BytesLeft, SizeVT),
+ Align, isVolatile, DstSV, DstSVOff + Offset);
+ }
+
+ // TODO: Use a Tokenfactor, as in memcpy, instead of a single chain.
+ return Chain;
+}
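The arithmetic in the constant-value path above can be checked in isolation: the byte is replicated across the rep-stos element, and the size splits into an element count plus a residue handled by the trailing getMemset call. A standalone sketch with assumed example values (no LLVM APIs involved):

  uint64_t Val = 0xAB;                    // byte to store
  uint64_t Pattern = Val | (Val << 8);    // 0xABAB for AX (WORD aligned)
  Pattern |= Pattern << 16;               // 0xABABABAB for EAX (DWORD aligned)
  uint64_t SizeVal = 37, UBytes = 4;      // DWORD-wide rep stos
  uint64_t Count = SizeVal / UBytes;      // 9 iterations
  uint64_t BytesLeft = SizeVal % UBytes;  // 1 trailing byte for getMemset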
+
+SDValue
+X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
+ SDValue Chain, SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align,
+ bool isVolatile, bool AlwaysInline,
+ const Value *DstSV,
+ uint64_t DstSVOff,
+ const Value *SrcSV,
+ uint64_t SrcSVOff) const {
+  // This requires the copy size to be a constant, preferably
+ // within a subtarget-specific limit.
+ ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+ if (!ConstantSize)
+ return SDValue();
+ uint64_t SizeVal = ConstantSize->getZExtValue();
+ if (!AlwaysInline && SizeVal > Subtarget->getMaxInlineSizeThreshold())
+ return SDValue();
+
+  // If not DWORD aligned, call the library.
+ if ((Align & 3) != 0)
+ return SDValue();
+
+ // DWORD aligned
+ EVT AVT = MVT::i32;
+ if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) // QWORD aligned
+ AVT = MVT::i64;
+
+ unsigned UBytes = AVT.getSizeInBits() / 8;
+ unsigned CountVal = SizeVal / UBytes;
+ SDValue Count = DAG.getIntPtrConstant(CountVal);
+ unsigned BytesLeft = SizeVal % UBytes;
+
+ SDValue InFlag(0, 0);
+ Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX :
+ X86::ECX,
+ Count, InFlag);
+ InFlag = Chain.getValue(1);
+ Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI :
+ X86::EDI,
+ Dst, InFlag);
+ InFlag = Chain.getValue(1);
+ Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RSI :
+ X86::ESI,
+ Src, InFlag);
+ InFlag = Chain.getValue(1);
+
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+ SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag };
+ SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops,
+ array_lengthof(Ops));
+
+ SmallVector<SDValue, 4> Results;
+ Results.push_back(RepMovs);
+ if (BytesLeft) {
+ // Handle the last 1 - 7 bytes.
+ unsigned Offset = SizeVal - BytesLeft;
+ EVT DstVT = Dst.getValueType();
+ EVT SrcVT = Src.getValueType();
+ EVT SizeVT = Size.getValueType();
+ Results.push_back(DAG.getMemcpy(Chain, dl,
+ DAG.getNode(ISD::ADD, dl, DstVT, Dst,
+ DAG.getConstant(Offset, DstVT)),
+ DAG.getNode(ISD::ADD, dl, SrcVT, Src,
+ DAG.getConstant(Offset, SrcVT)),
+ DAG.getConstant(BytesLeft, SizeVT),
+ Align, isVolatile, AlwaysInline,
+ DstSV, DstSVOff + Offset,
+ SrcSV, SrcSVOff + Offset));
+ }
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &Results[0], Results.size());
+}
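The rep-movs lowering follows the same count/residue split: RCX/ECX receives the element count, RDI/EDI the destination, and RSI/ESI the source, with any tail bytes copied by the recursive getMemcpy. A small numeric sketch under assumed values:

  uint64_t SizeVal = 70, UBytes = 8;       // QWORD-aligned copy on x86-64
  uint64_t CountVal = SizeVal / UBytes;    // 8 iterations of rep movsq
  uint64_t BytesLeft = SizeVal % UBytes;   // 6 bytes handled by the trailing memcpy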
diff --git a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h
new file mode 100644
index 0000000..4f30f31
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h
@@ -0,0 +1,59 @@
+//===-- X86SelectionDAGInfo.h - X86 SelectionDAG Info -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the X86 subclass for TargetSelectionDAGInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86SELECTIONDAGINFO_H
+#define X86SELECTIONDAGINFO_H
+
+#include "llvm/Target/TargetSelectionDAGInfo.h"
+
+namespace llvm {
+
+class X86TargetLowering;
+class X86TargetMachine;
+class X86Subtarget;
+
+class X86SelectionDAGInfo : public TargetSelectionDAGInfo {
+ /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
+ /// make the right decision when generating code for different targets.
+ const X86Subtarget *Subtarget;
+
+ const X86TargetLowering &TLI;
+
+public:
+ explicit X86SelectionDAGInfo(const X86TargetMachine &TM);
+ ~X86SelectionDAGInfo();
+
+ virtual
+ SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
+ SDValue Chain,
+ SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align,
+ bool isVolatile,
+ const Value *DstSV,
+ uint64_t DstSVOff) const;
+
+ virtual
+ SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
+ SDValue Chain,
+ SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align,
+ bool isVolatile, bool AlwaysInline,
+ const Value *DstSV,
+ uint64_t DstSVOff,
+ const Value *SrcSV,
+ uint64_t SrcSVOff) const;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.cpp b/contrib/llvm/lib/Target/X86/X86Subtarget.cpp
new file mode 100644
index 0000000..09a2685
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86Subtarget.cpp
@@ -0,0 +1,374 @@
+//===-- X86Subtarget.cpp - X86 Subtarget Information ------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the X86 specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "subtarget"
+#include "X86Subtarget.h"
+#include "X86InstrInfo.h"
+#include "X86GenSubtarget.inc"
+#include "llvm/GlobalValue.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/System/Host.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/ADT/SmallVector.h"
+using namespace llvm;
+
+#if defined(_MSC_VER)
+#include <intrin.h>
+#endif
+
+/// ClassifyBlockAddressReference - Classify a blockaddress reference for the
+/// current subtarget according to how we should reference it in a non-pcrel
+/// context.
+unsigned char X86Subtarget::
+ClassifyBlockAddressReference() const {
+ if (isPICStyleGOT()) // 32-bit ELF targets.
+ return X86II::MO_GOTOFF;
+
+ if (isPICStyleStubPIC()) // Darwin/32 in PIC mode.
+ return X86II::MO_PIC_BASE_OFFSET;
+
+ // Direct static reference to label.
+ return X86II::MO_NO_FLAG;
+}
+
+/// ClassifyGlobalReference - Classify a global variable reference for the
+/// current subtarget according to how we should reference it in a non-pcrel
+/// context.
+unsigned char X86Subtarget::
+ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const {
+ // DLLImport only exists on windows, it is implemented as a load from a
+ // DLLIMPORT stub.
+ if (GV->hasDLLImportLinkage())
+ return X86II::MO_DLLIMPORT;
+
+ // Materializable GVs (in JIT lazy compilation mode) do not require an
+  // extra load from a stub.
+ bool isDecl = GV->isDeclaration() && !GV->isMaterializable();
+
+ // X86-64 in PIC mode.
+ if (isPICStyleRIPRel()) {
+ // Large model never uses stubs.
+ if (TM.getCodeModel() == CodeModel::Large)
+ return X86II::MO_NO_FLAG;
+
+ if (isTargetDarwin()) {
+ // If symbol visibility is hidden, the extra load is not needed if
+ // target is x86-64 or the symbol is definitely defined in the current
+ // translation unit.
+ if (GV->hasDefaultVisibility() &&
+ (isDecl || GV->isWeakForLinker()))
+ return X86II::MO_GOTPCREL;
+ } else {
+ assert(isTargetELF() && "Unknown rip-relative target");
+
+      // Extra load is needed for all externally visible symbols.
+ if (!GV->hasLocalLinkage() && GV->hasDefaultVisibility())
+ return X86II::MO_GOTPCREL;
+ }
+
+ return X86II::MO_NO_FLAG;
+ }
+
+ if (isPICStyleGOT()) { // 32-bit ELF targets.
+    // Extra load is needed for all externally visible symbols.
+ if (GV->hasLocalLinkage() || GV->hasHiddenVisibility())
+ return X86II::MO_GOTOFF;
+ return X86II::MO_GOT;
+ }
+
+ if (isPICStyleStubPIC()) { // Darwin/32 in PIC mode.
+ // Determine whether we have a stub reference and/or whether the reference
+ // is relative to the PIC base or not.
+
+ // If this is a strong reference to a definition, it is definitely not
+ // through a stub.
+ if (!isDecl && !GV->isWeakForLinker())
+ return X86II::MO_PIC_BASE_OFFSET;
+
+ // Unless we have a symbol with hidden visibility, we have to go through a
+ // normal $non_lazy_ptr stub because this symbol might be resolved late.
+ if (!GV->hasHiddenVisibility()) // Non-hidden $non_lazy_ptr reference.
+ return X86II::MO_DARWIN_NONLAZY_PIC_BASE;
+
+ // If symbol visibility is hidden, we have a stub for common symbol
+ // references and external declarations.
+ if (isDecl || GV->hasCommonLinkage()) {
+ // Hidden $non_lazy_ptr reference.
+ return X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE;
+ }
+
+ // Otherwise, no stub.
+ return X86II::MO_PIC_BASE_OFFSET;
+ }
+
+ if (isPICStyleStubNoDynamic()) { // Darwin/32 in -mdynamic-no-pic mode.
+ // Determine whether we have a stub reference.
+
+ // If this is a strong reference to a definition, it is definitely not
+ // through a stub.
+ if (!isDecl && !GV->isWeakForLinker())
+ return X86II::MO_NO_FLAG;
+
+ // Unless we have a symbol with hidden visibility, we have to go through a
+ // normal $non_lazy_ptr stub because this symbol might be resolved late.
+ if (!GV->hasHiddenVisibility()) // Non-hidden $non_lazy_ptr reference.
+ return X86II::MO_DARWIN_NONLAZY;
+
+ // Otherwise, no stub.
+ return X86II::MO_NO_FLAG;
+ }
+
+ // Direct static reference to global.
+ return X86II::MO_NO_FLAG;
+}
+
+
+/// getBZeroEntry - This function returns the name of a function which has an
+/// interface like the non-standard bzero function, if such a function exists on
+/// the current subtarget and it is considered preferable over memset with zero
+/// passed as the second argument. Otherwise it returns null.
+const char *X86Subtarget::getBZeroEntry() const {
+ // Darwin 10 has a __bzero entry point for this purpose.
+ if (getDarwinVers() >= 10)
+ return "__bzero";
+
+ return 0;
+}
+
+/// IsLegalToCallImmediateAddr - Return true if the subtarget allows calls
+/// to immediate address.
+bool X86Subtarget::IsLegalToCallImmediateAddr(const TargetMachine &TM) const {
+ if (Is64Bit)
+ return false;
+ return isTargetELF() || TM.getRelocationModel() == Reloc::Static;
+}
+
+/// getSpecialAddressLatency - For targets where it is beneficial to
+/// backschedule instructions that compute addresses, return a value
+/// indicating the number of scheduling cycles of backscheduling that
+/// should be attempted.
+unsigned X86Subtarget::getSpecialAddressLatency() const {
+ // For x86 out-of-order targets, back-schedule address computations so
+ // that loads and stores aren't blocked.
+ // This value was chosen arbitrarily.
+ return 200;
+}
+
+/// GetCpuIDAndInfo - Execute the specified cpuid and return the 4 values in the
+/// specified arguments. If we can't run cpuid on the host, return true.
+static bool GetCpuIDAndInfo(unsigned value, unsigned *rEAX,
+ unsigned *rEBX, unsigned *rECX, unsigned *rEDX) {
+#if defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64)
+ #if defined(__GNUC__)
+  // gcc doesn't know that cpuid clobbers ebx/rbx. Preserve it manually.
+ asm ("movq\t%%rbx, %%rsi\n\t"
+ "cpuid\n\t"
+ "xchgq\t%%rbx, %%rsi\n\t"
+ : "=a" (*rEAX),
+ "=S" (*rEBX),
+ "=c" (*rECX),
+ "=d" (*rEDX)
+ : "a" (value));
+ return false;
+ #elif defined(_MSC_VER)
+ int registers[4];
+ __cpuid(registers, value);
+ *rEAX = registers[0];
+ *rEBX = registers[1];
+ *rECX = registers[2];
+ *rEDX = registers[3];
+ return false;
+ #endif
+#elif defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)
+ #if defined(__GNUC__)
+ asm ("movl\t%%ebx, %%esi\n\t"
+ "cpuid\n\t"
+ "xchgl\t%%ebx, %%esi\n\t"
+ : "=a" (*rEAX),
+ "=S" (*rEBX),
+ "=c" (*rECX),
+ "=d" (*rEDX)
+ : "a" (value));
+ return false;
+ #elif defined(_MSC_VER)
+ __asm {
+ mov eax,value
+ cpuid
+ mov esi,rEAX
+ mov dword ptr [esi],eax
+ mov esi,rEBX
+ mov dword ptr [esi],ebx
+ mov esi,rECX
+ mov dword ptr [esi],ecx
+ mov esi,rEDX
+ mov dword ptr [esi],edx
+ }
+ return false;
+ #endif
+#endif
+ return true;
+}
+
+static void DetectFamilyModel(unsigned EAX, unsigned &Family, unsigned &Model) {
+ Family = (EAX >> 8) & 0xf; // Bits 8 - 11
+ Model = (EAX >> 4) & 0xf; // Bits 4 - 7
+ if (Family == 6 || Family == 0xf) {
+ if (Family == 0xf)
+ // Examine extended family ID if family ID is F.
+ Family += (EAX >> 20) & 0xff; // Bits 20 - 27
+ // Examine extended model ID if family ID is 6 or F.
+ Model += ((EAX >> 16) & 0xf) << 4; // Bits 16 - 19
+ }
+}
+
+void X86Subtarget::AutoDetectSubtargetFeatures() {
+ unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0;
+ union {
+ unsigned u[3];
+ char c[12];
+ } text;
+
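+  // CPUID leaf 0 returns the vendor string in EBX, EDX, ECX order; routing
+  // EBX, EDX, ECX into u[0], u[1], u[2] lays out "GenuineIntel" /
+  // "AuthenticAMD" contiguously in text.c.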
+ if (GetCpuIDAndInfo(0, &EAX, text.u+0, text.u+2, text.u+1))
+ return;
+
+ GetCpuIDAndInfo(0x1, &EAX, &EBX, &ECX, &EDX);
+
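+  // CPUID leaf 1: EDX reports the legacy features (CMOV, MMX, SSE, SSE2),
+  // ECX the newer ones (SSE3 through SSE4.2, AES, AVX, FMA).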
+ if ((EDX >> 15) & 1) HasCMov = true;
+ if ((EDX >> 23) & 1) X86SSELevel = MMX;
+ if ((EDX >> 25) & 1) X86SSELevel = SSE1;
+ if ((EDX >> 26) & 1) X86SSELevel = SSE2;
+ if (ECX & 0x1) X86SSELevel = SSE3;
+ if ((ECX >> 9) & 1) X86SSELevel = SSSE3;
+ if ((ECX >> 19) & 1) X86SSELevel = SSE41;
+ if ((ECX >> 20) & 1) X86SSELevel = SSE42;
+
+ bool IsIntel = memcmp(text.c, "GenuineIntel", 12) == 0;
+ bool IsAMD = !IsIntel && memcmp(text.c, "AuthenticAMD", 12) == 0;
+
+ HasFMA3 = IsIntel && ((ECX >> 12) & 0x1);
+ HasAVX = ((ECX >> 28) & 0x1);
+ HasAES = IsIntel && ((ECX >> 25) & 0x1);
+
+ if (IsIntel || IsAMD) {
+ // Determine if bit test memory instructions are slow.
+ unsigned Family = 0;
+ unsigned Model = 0;
+ DetectFamilyModel(EAX, Family, Model);
+ IsBTMemSlow = IsAMD || (Family == 6 && Model >= 13);
+ // If it's Nehalem, unaligned memory access is fast.
+    if (Family == 6 && Model == 26)
+ IsUAMemFast = true;
+
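+    // Extended leaf 0x80000001: EDX bit 29 is long mode (x86-64); ECX carries
+    // the AMD-only extensions (SSE4A, FMA4).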
+ GetCpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX);
+ HasX86_64 = (EDX >> 29) & 0x1;
+ HasSSE4A = IsAMD && ((ECX >> 6) & 0x1);
+ HasFMA4 = IsAMD && ((ECX >> 16) & 0x1);
+ }
+}
+
+X86Subtarget::X86Subtarget(const std::string &TT, const std::string &FS,
+ bool is64Bit)
+ : PICStyle(PICStyles::None)
+ , X86SSELevel(NoMMXSSE)
+ , X863DNowLevel(NoThreeDNow)
+ , HasCMov(false)
+ , HasX86_64(false)
+ , HasSSE4A(false)
+ , HasAVX(false)
+ , HasAES(false)
+ , HasFMA3(false)
+ , HasFMA4(false)
+ , IsBTMemSlow(false)
+ , IsUAMemFast(false)
+ , HasVectorUAMem(false)
+ , DarwinVers(0)
+ , stackAlignment(8)
+ // FIXME: this is a known good value for Yonah. How about others?
+ , MaxInlineSizeThreshold(128)
+ , Is64Bit(is64Bit)
+ , TargetType(isELF) { // Default to ELF unless otherwise specified.
+
+ // default to hard float ABI
+ if (FloatABIType == FloatABI::Default)
+ FloatABIType = FloatABI::Hard;
+
+ // Determine default and user specified characteristics
+ if (!FS.empty()) {
+ // If feature string is not empty, parse features string.
+ std::string CPU = sys::getHostCPUName();
+ ParseSubtargetFeatures(FS, CPU);
+    // All X86-64 CPUs also have SSE2; however, the user might request no SSE
+    // via -mattr, so don't force SSELevel here.
+ } else {
+ // Otherwise, use CPUID to auto-detect feature set.
+ AutoDetectSubtargetFeatures();
+ // Make sure SSE2 is enabled; it is available on all X86-64 CPUs.
+ if (Is64Bit && X86SSELevel < SSE2)
+ X86SSELevel = SSE2;
+ }
+
+ // If requesting codegen for X86-64, make sure that 64-bit features
+ // are enabled.
+ if (Is64Bit) {
+ HasX86_64 = true;
+
+ // All 64-bit cpus have cmov support.
+ HasCMov = true;
+ }
+
+
+ DEBUG(dbgs() << "Subtarget features: SSELevel " << X86SSELevel
+ << ", 3DNowLevel " << X863DNowLevel
+ << ", 64bit " << HasX86_64 << "\n");
+ assert((!Is64Bit || HasX86_64) &&
+ "64-bit code requested on a subtarget that doesn't support it!");
+
+  // Set TargetType from the target triple; default to ELF if the triple
+  // cannot be classified.
+ if (TT.length() > 5) {
+ size_t Pos;
+ if ((Pos = TT.find("-darwin")) != std::string::npos) {
+ TargetType = isDarwin;
+
+ // Compute the darwin version number.
+ if (isdigit(TT[Pos+7]))
+ DarwinVers = atoi(&TT[Pos+7]);
+ else
+ DarwinVers = 8; // Minimum supported darwin is Tiger.
+ } else if (TT.find("linux") != std::string::npos) {
+ // Linux doesn't imply ELF, but we don't currently support anything else.
+ TargetType = isELF;
+ } else if (TT.find("cygwin") != std::string::npos) {
+ TargetType = isCygwin;
+ } else if (TT.find("mingw") != std::string::npos) {
+ TargetType = isMingw;
+ } else if (TT.find("win32") != std::string::npos) {
+ TargetType = isWindows;
+ } else if (TT.find("windows") != std::string::npos) {
+ TargetType = isWindows;
+ } else if (TT.find("-cl") != std::string::npos) {
+ TargetType = isDarwin;
+ DarwinVers = 9;
+ }
+ }
+
+ // Stack alignment is 16 bytes on Darwin (both 32 and 64 bit) and for all 64
+ // bit targets.
+ if (TargetType == isDarwin || Is64Bit)
+ stackAlignment = 16;
+
+ if (StackAlignment)
+ stackAlignment = StackAlignment;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.h b/contrib/llvm/lib/Target/X86/X86Subtarget.h
new file mode 100644
index 0000000..646af91
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86Subtarget.h
@@ -0,0 +1,244 @@
+//=====---- X86Subtarget.h - Define Subtarget for the X86 -----*- C++ -*--====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the X86 specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86SUBTARGET_H
+#define X86SUBTARGET_H
+
+#include "llvm/Target/TargetSubtarget.h"
+#include <string>
+
+namespace llvm {
+class GlobalValue;
+class TargetMachine;
+
+/// PICStyles - The X86 backend supports a number of different styles of PIC.
+///
+namespace PICStyles {
+enum Style {
+ StubPIC, // Used on i386-darwin in -fPIC mode.
+ StubDynamicNoPIC, // Used on i386-darwin in -mdynamic-no-pic mode.
+ GOT, // Used on many 32-bit unices in -fPIC mode.
+ RIPRel, // Used on X86-64 when not in -static mode.
+ None // Set when in -static mode (not PIC or DynamicNoPIC mode).
+};
+}
+
+class X86Subtarget : public TargetSubtarget {
+protected:
+ enum X86SSEEnum {
+ NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42
+ };
+
+ enum X863DNowEnum {
+ NoThreeDNow, ThreeDNow, ThreeDNowA
+ };
+
+ /// PICStyle - Which PIC style to use
+ ///
+ PICStyles::Style PICStyle;
+
+ /// X86SSELevel - MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, or
+ /// none supported.
+ X86SSEEnum X86SSELevel;
+
+ /// X863DNowLevel - 3DNow or 3DNow Athlon, or none supported.
+ ///
+ X863DNowEnum X863DNowLevel;
+
+ /// HasCMov - True if this processor has conditional move instructions
+ /// (generally pentium pro+).
+ bool HasCMov;
+
+ /// HasX86_64 - True if the processor supports X86-64 instructions.
+ ///
+ bool HasX86_64;
+
+ /// HasSSE4A - True if the processor supports SSE4A instructions.
+ bool HasSSE4A;
+
+ /// HasAVX - Target has AVX instructions
+ bool HasAVX;
+
+ /// HasAES - Target has AES instructions
+ bool HasAES;
+
+ /// HasFMA3 - Target has 3-operand fused multiply-add
+ bool HasFMA3;
+
+ /// HasFMA4 - Target has 4-operand fused multiply-add
+ bool HasFMA4;
+
+ /// IsBTMemSlow - True if BT (bit test) of memory instructions are slow.
+ bool IsBTMemSlow;
+
+ /// IsUAMemFast - True if unaligned memory access is fast.
+ bool IsUAMemFast;
+
+ /// HasVectorUAMem - True if SIMD operations can have unaligned memory
+ /// operands. This may require setting a feature bit in the processor.
+ bool HasVectorUAMem;
+
+ /// DarwinVers - Nonzero if this is a darwin platform: the numeric
+ /// version of the platform, e.g. 8 = 10.4 (Tiger), 9 = 10.5 (Leopard), etc.
+ unsigned char DarwinVers; // Is any darwin-x86 platform.
+
+  /// stackAlignment - The minimum alignment known to hold for the stack frame
+  /// on entry to the function, and which must be maintained by every function.
+ unsigned stackAlignment;
+
+ /// Max. memset / memcpy size that is turned into rep/movs, rep/stos ops.
+ ///
+ unsigned MaxInlineSizeThreshold;
+
+private:
+ /// Is64Bit - True if the processor supports 64-bit instructions and
+ /// pointer size is 64 bit.
+ bool Is64Bit;
+
+public:
+ enum {
+ isELF, isCygwin, isDarwin, isWindows, isMingw
+ } TargetType;
+
+ /// This constructor initializes the data members to match that
+ /// of the specified triple.
+ ///
+ X86Subtarget(const std::string &TT, const std::string &FS, bool is64Bit);
+
+  /// getStackAlignment - Returns the minimum alignment known to hold for the
+  /// stack frame on entry to the function, and which must be maintained by
+  /// every function for this subtarget.
+ unsigned getStackAlignment() const { return stackAlignment; }
+
+ /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
+ /// that still makes it profitable to inline the call.
+ unsigned getMaxInlineSizeThreshold() const { return MaxInlineSizeThreshold; }
+
+ /// ParseSubtargetFeatures - Parses features string setting specified
+ /// subtarget options. Definition of function is auto generated by tblgen.
+ std::string ParseSubtargetFeatures(const std::string &FS,
+ const std::string &CPU);
+
+ /// AutoDetectSubtargetFeatures - Auto-detect CPU features using CPUID
+ /// instruction.
+ void AutoDetectSubtargetFeatures();
+
+ bool is64Bit() const { return Is64Bit; }
+
+ PICStyles::Style getPICStyle() const { return PICStyle; }
+ void setPICStyle(PICStyles::Style Style) { PICStyle = Style; }
+
+ bool hasCMov() const { return HasCMov; }
+ bool hasMMX() const { return X86SSELevel >= MMX; }
+ bool hasSSE1() const { return X86SSELevel >= SSE1; }
+ bool hasSSE2() const { return X86SSELevel >= SSE2; }
+ bool hasSSE3() const { return X86SSELevel >= SSE3; }
+ bool hasSSSE3() const { return X86SSELevel >= SSSE3; }
+ bool hasSSE41() const { return X86SSELevel >= SSE41; }
+ bool hasSSE42() const { return X86SSELevel >= SSE42; }
+ bool hasSSE4A() const { return HasSSE4A; }
+ bool has3DNow() const { return X863DNowLevel >= ThreeDNow; }
+ bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; }
+ bool hasAVX() const { return HasAVX; }
+ bool hasAES() const { return HasAES; }
+ bool hasFMA3() const { return HasFMA3; }
+ bool hasFMA4() const { return HasFMA4; }
+ bool isBTMemSlow() const { return IsBTMemSlow; }
+ bool isUnalignedMemAccessFast() const { return IsUAMemFast; }
+ bool hasVectorUAMem() const { return HasVectorUAMem; }
+
+ bool isTargetDarwin() const { return TargetType == isDarwin; }
+ bool isTargetELF() const { return TargetType == isELF; }
+
+ bool isTargetWindows() const { return TargetType == isWindows; }
+ bool isTargetMingw() const { return TargetType == isMingw; }
+ bool isTargetCygwin() const { return TargetType == isCygwin; }
+ bool isTargetCygMing() const {
+ return TargetType == isMingw || TargetType == isCygwin;
+ }
+
+ /// isTargetCOFF - Return true if this is any COFF/Windows target variant.
+ bool isTargetCOFF() const {
+ return TargetType == isMingw || TargetType == isCygwin ||
+ TargetType == isWindows;
+ }
+
+ bool isTargetWin64() const {
+ return Is64Bit && (TargetType == isMingw || TargetType == isWindows);
+ }
+
+ std::string getDataLayout() const {
+ const char *p;
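+    // Target data layout string: "e" = little-endian, "p:N:N" = pointer size
+    // and alignment, "iN"/"fN" entries = integer/float alignments, "n..." =
+    // native integer widths for this target.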
+ if (is64Bit())
+ p = "e-p:64:64-s:64-f64:64:64-i64:64:64-f80:128:128-n8:16:32:64";
+ else if (isTargetDarwin())
+ p = "e-p:32:32-f64:32:64-i64:32:64-f80:128:128-n8:16:32";
+ else if (isTargetMingw() || isTargetWindows())
+ p = "e-p:32:32-f64:64:64-i64:64:64-f80:32:32-n8:16:32";
+ else
+ p = "e-p:32:32-f64:32:64-i64:32:64-f80:32:32-n8:16:32";
+
+ return std::string(p);
+ }
+
+ bool isPICStyleSet() const { return PICStyle != PICStyles::None; }
+ bool isPICStyleGOT() const { return PICStyle == PICStyles::GOT; }
+ bool isPICStyleRIPRel() const { return PICStyle == PICStyles::RIPRel; }
+
+ bool isPICStyleStubPIC() const {
+ return PICStyle == PICStyles::StubPIC;
+ }
+
+ bool isPICStyleStubNoDynamic() const {
+ return PICStyle == PICStyles::StubDynamicNoPIC;
+ }
+ bool isPICStyleStubAny() const {
+ return PICStyle == PICStyles::StubDynamicNoPIC ||
+ PICStyle == PICStyles::StubPIC; }
+
+ /// getDarwinVers - Return the darwin version number, 8 = Tiger, 9 = Leopard,
+ /// 10 = Snow Leopard, etc.
+ unsigned getDarwinVers() const { return DarwinVers; }
+
+ /// ClassifyGlobalReference - Classify a global variable reference for the
+ /// current subtarget according to how we should reference it in a non-pcrel
+ /// context.
+ unsigned char ClassifyGlobalReference(const GlobalValue *GV,
+ const TargetMachine &TM)const;
+
+ /// ClassifyBlockAddressReference - Classify a blockaddress reference for the
+ /// current subtarget according to how we should reference it in a non-pcrel
+ /// context.
+ unsigned char ClassifyBlockAddressReference() const;
+
+ /// IsLegalToCallImmediateAddr - Return true if the subtarget allows calls
+ /// to immediate address.
+ bool IsLegalToCallImmediateAddr(const TargetMachine &TM) const;
+
+ /// This function returns the name of a function which has an interface
+ /// like the non-standard bzero function, if such a function exists on
+  /// the current subtarget and it is considered preferable over
+ /// memset with zero passed as the second argument. Otherwise it
+ /// returns null.
+ const char *getBZeroEntry() const;
+
+ /// getSpecialAddressLatency - For targets where it is beneficial to
+ /// backschedule instructions that compute addresses, return a value
+ /// indicating the number of scheduling cycles of backscheduling that
+ /// should be attempted.
+ unsigned getSpecialAddressLatency() const;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp
new file mode 100644
index 0000000..f2c5058
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -0,0 +1,238 @@
+//===-- X86TargetMachine.cpp - Define TargetMachine for the X86 -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the X86 specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86MCAsmInfo.h"
+#include "X86TargetMachine.h"
+#include "X86.h"
+#include "llvm/PassManager.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegistry.h"
+using namespace llvm;
+
+static MCAsmInfo *createMCAsmInfo(const Target &T, StringRef TT) {
+ Triple TheTriple(TT);
+ switch (TheTriple.getOS()) {
+ case Triple::Darwin:
+ return new X86MCAsmInfoDarwin(TheTriple);
+ case Triple::MinGW32:
+ case Triple::MinGW64:
+ case Triple::Cygwin:
+ case Triple::Win32:
+ return new X86MCAsmInfoCOFF(TheTriple);
+ default:
+ return new X86ELFMCAsmInfo(TheTriple);
+ }
+}
+
+static MCStreamer *createMCStreamer(const Target &T, const std::string &TT,
+ MCContext &Ctx, TargetAsmBackend &TAB,
+ raw_ostream &_OS,
+ MCCodeEmitter *_Emitter,
+ bool RelaxAll) {
+ Triple TheTriple(TT);
+ switch (TheTriple.getOS()) {
+ default:
+ return createMachOStreamer(Ctx, TAB, _OS, _Emitter, RelaxAll);
+ }
+}
+
+extern "C" void LLVMInitializeX86Target() {
+ // Register the target.
+ RegisterTargetMachine<X86_32TargetMachine> X(TheX86_32Target);
+ RegisterTargetMachine<X86_64TargetMachine> Y(TheX86_64Target);
+
+ // Register the target asm info.
+ RegisterAsmInfoFn A(TheX86_32Target, createMCAsmInfo);
+ RegisterAsmInfoFn B(TheX86_64Target, createMCAsmInfo);
+
+ // Register the code emitter.
+ TargetRegistry::RegisterCodeEmitter(TheX86_32Target,
+ createX86_32MCCodeEmitter);
+ TargetRegistry::RegisterCodeEmitter(TheX86_64Target,
+ createX86_64MCCodeEmitter);
+
+ // Register the asm backend.
+ TargetRegistry::RegisterAsmBackend(TheX86_32Target,
+ createX86_32AsmBackend);
+ TargetRegistry::RegisterAsmBackend(TheX86_64Target,
+ createX86_64AsmBackend);
+
+ // Register the object streamer.
+ TargetRegistry::RegisterObjectStreamer(TheX86_32Target,
+ createMCStreamer);
+ TargetRegistry::RegisterObjectStreamer(TheX86_64Target,
+ createMCStreamer);
+}
+
+
+X86_32TargetMachine::X86_32TargetMachine(const Target &T, const std::string &TT,
+ const std::string &FS)
+ : X86TargetMachine(T, TT, FS, false) {
+}
+
+
+X86_64TargetMachine::X86_64TargetMachine(const Target &T, const std::string &TT,
+ const std::string &FS)
+ : X86TargetMachine(T, TT, FS, true) {
+}
+
+/// X86TargetMachine ctor - Create an X86 target.
+///
+X86TargetMachine::X86TargetMachine(const Target &T, const std::string &TT,
+ const std::string &FS, bool is64Bit)
+ : LLVMTargetMachine(T, TT),
+ Subtarget(TT, FS, is64Bit),
+ DataLayout(Subtarget.getDataLayout()),
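+    // Local area offset: the return address slot (-4 / -8), plus the 32-byte
+    // register parameter (shadow) area on Win64 (-40).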
+ FrameInfo(TargetFrameInfo::StackGrowsDown,
+ Subtarget.getStackAlignment(),
+ (Subtarget.isTargetWin64() ? -40 :
+ (Subtarget.is64Bit() ? -8 : -4))),
+ InstrInfo(*this), JITInfo(*this), TLInfo(*this), TSInfo(*this),
+ ELFWriterInfo(*this) {
+ DefRelocModel = getRelocationModel();
+
+ // If no relocation model was picked, default as appropriate for the target.
+ if (getRelocationModel() == Reloc::Default) {
+ if (!Subtarget.isTargetDarwin())
+ setRelocationModel(Reloc::Static);
+ else if (Subtarget.is64Bit())
+ setRelocationModel(Reloc::PIC_);
+ else
+ setRelocationModel(Reloc::DynamicNoPIC);
+ }
+
+ assert(getRelocationModel() != Reloc::Default &&
+ "Relocation mode not picked");
+
+ // ELF and X86-64 don't have a distinct DynamicNoPIC model. DynamicNoPIC
+ // is defined as a model for code which may be used in static or dynamic
+ // executables but not necessarily a shared library. On X86-32 we just
+ // compile in -static mode, in x86-64 we use PIC.
+ if (getRelocationModel() == Reloc::DynamicNoPIC) {
+ if (is64Bit)
+ setRelocationModel(Reloc::PIC_);
+ else if (!Subtarget.isTargetDarwin())
+ setRelocationModel(Reloc::Static);
+ }
+
+ // If we are on Darwin, disallow static relocation model in X86-64 mode, since
+ // the Mach-O file format doesn't support it.
+ if (getRelocationModel() == Reloc::Static &&
+ Subtarget.isTargetDarwin() &&
+ is64Bit)
+ setRelocationModel(Reloc::PIC_);
+
+ // Determine the PICStyle based on the target selected.
+ if (getRelocationModel() == Reloc::Static) {
+ // Unless we're in PIC or DynamicNoPIC mode, set the PIC style to None.
+ Subtarget.setPICStyle(PICStyles::None);
+ } else if (Subtarget.isTargetCygMing()) {
+ Subtarget.setPICStyle(PICStyles::None);
+ } else if (Subtarget.isTargetDarwin()) {
+ if (Subtarget.is64Bit())
+ Subtarget.setPICStyle(PICStyles::RIPRel);
+ else if (getRelocationModel() == Reloc::PIC_)
+ Subtarget.setPICStyle(PICStyles::StubPIC);
+ else {
+ assert(getRelocationModel() == Reloc::DynamicNoPIC);
+ Subtarget.setPICStyle(PICStyles::StubDynamicNoPIC);
+ }
+ } else if (Subtarget.isTargetELF()) {
+ if (Subtarget.is64Bit())
+ Subtarget.setPICStyle(PICStyles::RIPRel);
+ else
+ Subtarget.setPICStyle(PICStyles::GOT);
+ }
+
+ // Finally, if we have "none" as our PIC style, force to static mode.
+ if (Subtarget.getPICStyle() == PICStyles::None)
+ setRelocationModel(Reloc::Static);
+}
+
+//===----------------------------------------------------------------------===//
+// Pass Pipeline Configuration
+//===----------------------------------------------------------------------===//
+
+bool X86TargetMachine::addInstSelector(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel) {
+ // Install an instruction selector.
+ PM.add(createX86ISelDag(*this, OptLevel));
+
+ // Install a pass to insert x87 FP_REG_KILL instructions, as needed.
+ PM.add(createX87FPRegKillInserterPass());
+
+ return false;
+}
+
+bool X86TargetMachine::addPreRegAlloc(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel) {
+ PM.add(createX86MaxStackAlignmentHeuristicPass());
+ return false; // -print-machineinstr shouldn't print after this.
+}
+
+bool X86TargetMachine::addPostRegAlloc(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel) {
+ PM.add(createX86FloatingPointStackifierPass());
+ return true; // -print-machineinstr should print after this.
+}
+
+bool X86TargetMachine::addPreEmitPass(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel) {
+ if (OptLevel != CodeGenOpt::None && Subtarget.hasSSE2()) {
+ PM.add(createSSEDomainFixPass());
+ return true;
+ }
+ return false;
+}
+
+bool X86TargetMachine::addCodeEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ JITCodeEmitter &JCE) {
+ // FIXME: Move this to TargetJITInfo!
+ // On Darwin, do not override 64-bit setting made in X86TargetMachine().
+ if (DefRelocModel == Reloc::Default &&
+ (!Subtarget.isTargetDarwin() || !Subtarget.is64Bit())) {
+ setRelocationModel(Reloc::Static);
+ Subtarget.setPICStyle(PICStyles::None);
+ }
+
+
+ PM.add(createX86JITCodeEmitterPass(*this, JCE));
+
+ return false;
+}
+
+void X86TargetMachine::setCodeModelForStatic() {
+
+ if (getCodeModel() != CodeModel::Default) return;
+
+ // For static codegen, if we're not already set, use Small codegen.
+ setCodeModel(CodeModel::Small);
+}
+
+
+void X86TargetMachine::setCodeModelForJIT() {
+
+ if (getCodeModel() != CodeModel::Default) return;
+
+ // 64-bit JIT places everything in the same buffer except external functions.
+ if (Subtarget.is64Bit())
+ setCodeModel(CodeModel::Large);
+ else
+ setCodeModel(CodeModel::Small);
+}
diff --git a/contrib/llvm/lib/Target/X86/X86TargetMachine.h b/contrib/llvm/lib/Target/X86/X86TargetMachine.h
new file mode 100644
index 0000000..f9fb424
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86TargetMachine.h
@@ -0,0 +1,97 @@
+//===-- X86TargetMachine.h - Define TargetMachine for the X86 ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the X86 specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86TARGETMACHINE_H
+#define X86TARGETMACHINE_H
+
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "X86.h"
+#include "X86ELFWriterInfo.h"
+#include "X86InstrInfo.h"
+#include "X86JITInfo.h"
+#include "X86Subtarget.h"
+#include "X86ISelLowering.h"
+#include "X86SelectionDAGInfo.h"
+
+namespace llvm {
+
+class formatted_raw_ostream;
+
+class X86TargetMachine : public LLVMTargetMachine {
+ X86Subtarget Subtarget;
+ const TargetData DataLayout; // Calculates type size & alignment
+ TargetFrameInfo FrameInfo;
+ X86InstrInfo InstrInfo;
+ X86JITInfo JITInfo;
+ X86TargetLowering TLInfo;
+ X86SelectionDAGInfo TSInfo;
+ X86ELFWriterInfo ELFWriterInfo;
+ Reloc::Model DefRelocModel; // Reloc model before it's overridden.
+
+private:
+ // We have specific defaults for X86.
+ virtual void setCodeModelForJIT();
+ virtual void setCodeModelForStatic();
+
+public:
+ X86TargetMachine(const Target &T, const std::string &TT,
+ const std::string &FS, bool is64Bit);
+
+ virtual const X86InstrInfo *getInstrInfo() const { return &InstrInfo; }
+ virtual const TargetFrameInfo *getFrameInfo() const { return &FrameInfo; }
+ virtual X86JITInfo *getJITInfo() { return &JITInfo; }
+  virtual const X86Subtarget *getSubtargetImpl() const { return &Subtarget; }
+ virtual const X86TargetLowering *getTargetLowering() const {
+ return &TLInfo;
+ }
+ virtual const X86SelectionDAGInfo *getSelectionDAGInfo() const {
+ return &TSInfo;
+ }
+ virtual const X86RegisterInfo *getRegisterInfo() const {
+ return &InstrInfo.getRegisterInfo();
+ }
+ virtual const TargetData *getTargetData() const { return &DataLayout; }
+ virtual const X86ELFWriterInfo *getELFWriterInfo() const {
+ return Subtarget.isTargetELF() ? &ELFWriterInfo : 0;
+ }
+
+ // Set up the pass pipeline.
+ virtual bool addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
+ virtual bool addPreRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
+ virtual bool addPostRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
+ virtual bool addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
+ virtual bool addCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel,
+ JITCodeEmitter &JCE);
+};
+
+/// X86_32TargetMachine - X86 32-bit target machine.
+///
+class X86_32TargetMachine : public X86TargetMachine {
+public:
+  X86_32TargetMachine(const Target &T, const std::string &TT,
+ const std::string &FS);
+};
+
+/// X86_64TargetMachine - X86 64-bit target machine.
+///
+class X86_64TargetMachine : public X86TargetMachine {
+public:
+ X86_64TargetMachine(const Target &T, const std::string &TT,
+ const std::string &FS);
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp
new file mode 100644
index 0000000..c15dfbb
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp
@@ -0,0 +1,118 @@
+//===-- llvm/Target/X86/X86TargetObjectFile.cpp - X86 Object Info ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86TargetObjectFile.h"
+#include "X86TargetMachine.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/Target/Mangler.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/Support/Dwarf.h"
+using namespace llvm;
+using namespace dwarf;
+
+const MCExpr *X8664_MachoTargetObjectFile::
+getExprForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang,
+ MachineModuleInfo *MMI, unsigned Encoding,
+ MCStreamer &Streamer) const {
+
+ // On Darwin/X86-64, we can reference dwarf symbols with foo@GOTPCREL+4, which
+ // is an indirect pc-relative reference.
+ if (Encoding & (DW_EH_PE_indirect | DW_EH_PE_pcrel)) {
+ const MCSymbol *Sym = Mang->getSymbol(GV);
+ const MCExpr *Res =
+ MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTPCREL, getContext());
+ const MCExpr *Four = MCConstantExpr::Create(4, getContext());
+ return MCBinaryExpr::CreateAdd(Res, Four, getContext());
+ }
+
+ return TargetLoweringObjectFileMachO::
+ getExprForDwarfGlobalReference(GV, Mang, MMI, Encoding, Streamer);
+}
+
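+// In PIC mode the exception-handling tables must stay position independent,
+// so use indirect, pc-relative, 4-byte signed encodings; static code can use
+// absolute pointers.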
+unsigned X8632_ELFTargetObjectFile::getPersonalityEncoding() const {
+ if (TM.getRelocationModel() == Reloc::PIC_)
+ return DW_EH_PE_indirect | DW_EH_PE_pcrel | DW_EH_PE_sdata4;
+ else
+ return DW_EH_PE_absptr;
+}
+
+unsigned X8632_ELFTargetObjectFile::getLSDAEncoding() const {
+ if (TM.getRelocationModel() == Reloc::PIC_)
+ return DW_EH_PE_pcrel | DW_EH_PE_sdata4;
+ else
+ return DW_EH_PE_absptr;
+}
+
+unsigned X8632_ELFTargetObjectFile::getFDEEncoding() const {
+ if (TM.getRelocationModel() == Reloc::PIC_)
+ return DW_EH_PE_pcrel | DW_EH_PE_sdata4;
+ else
+ return DW_EH_PE_absptr;
+}
+
+unsigned X8632_ELFTargetObjectFile::getTTypeEncoding() const {
+ if (TM.getRelocationModel() == Reloc::PIC_)
+ return DW_EH_PE_indirect | DW_EH_PE_pcrel | DW_EH_PE_sdata4;
+ else
+ return DW_EH_PE_absptr;
+}
+
+unsigned X8664_ELFTargetObjectFile::getPersonalityEncoding() const {
+ CodeModel::Model Model = TM.getCodeModel();
+ if (TM.getRelocationModel() == Reloc::PIC_)
+ return DW_EH_PE_indirect | DW_EH_PE_pcrel | (Model == CodeModel::Small ||
+ Model == CodeModel::Medium ?
+ DW_EH_PE_sdata4 : DW_EH_PE_sdata8);
+
+ if (Model == CodeModel::Small || Model == CodeModel::Medium)
+ return DW_EH_PE_udata4;
+
+ return DW_EH_PE_absptr;
+}
+
+unsigned X8664_ELFTargetObjectFile::getLSDAEncoding() const {
+ CodeModel::Model Model = TM.getCodeModel();
+ if (TM.getRelocationModel() == Reloc::PIC_)
+ return DW_EH_PE_pcrel | (Model == CodeModel::Small ?
+ DW_EH_PE_sdata4 : DW_EH_PE_sdata8);
+
+ if (Model == CodeModel::Small)
+ return DW_EH_PE_udata4;
+
+ return DW_EH_PE_absptr;
+}
+
+unsigned X8664_ELFTargetObjectFile::getFDEEncoding() const {
+ CodeModel::Model Model = TM.getCodeModel();
+ if (TM.getRelocationModel() == Reloc::PIC_)
+ return DW_EH_PE_pcrel | (Model == CodeModel::Small ||
+ Model == CodeModel::Medium ?
+ DW_EH_PE_sdata4 : DW_EH_PE_sdata8);
+
+ if (Model == CodeModel::Small || Model == CodeModel::Medium)
+ return DW_EH_PE_udata4;
+
+ return DW_EH_PE_absptr;
+}
+
+unsigned X8664_ELFTargetObjectFile::getTTypeEncoding() const {
+ CodeModel::Model Model = TM.getCodeModel();
+ if (TM.getRelocationModel() == Reloc::PIC_)
+ return DW_EH_PE_indirect | DW_EH_PE_pcrel | (Model == CodeModel::Small ||
+ Model == CodeModel::Medium ?
+ DW_EH_PE_sdata4 : DW_EH_PE_sdata8);
+
+ if (Model == CodeModel::Small)
+ return DW_EH_PE_udata4;
+
+ return DW_EH_PE_absptr;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h
new file mode 100644
index 0000000..f2fd49c
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h
@@ -0,0 +1,54 @@
+//===-- llvm/Target/X86/X86TargetObjectFile.h - X86 Object Info -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_X86_TARGETOBJECTFILE_H
+#define LLVM_TARGET_X86_TARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+
+namespace llvm {
+ class X86TargetMachine;
+
+ /// X8664_MachoTargetObjectFile - This TLOF implementation is used for Darwin
+ /// x86-64.
+ class X8664_MachoTargetObjectFile : public TargetLoweringObjectFileMachO {
+ public:
+ virtual const MCExpr *
+ getExprForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang,
+ MachineModuleInfo *MMI, unsigned Encoding,
+ MCStreamer &Streamer) const;
+ };
+
+ class X8632_ELFTargetObjectFile : public TargetLoweringObjectFileELF {
+ const X86TargetMachine &TM;
+ public:
+ X8632_ELFTargetObjectFile(const X86TargetMachine &tm)
+ :TM(tm) { }
+ virtual unsigned getPersonalityEncoding() const;
+ virtual unsigned getLSDAEncoding() const;
+ virtual unsigned getFDEEncoding() const;
+ virtual unsigned getTTypeEncoding() const;
+ };
+
+ class X8664_ELFTargetObjectFile : public TargetLoweringObjectFileELF {
+ const X86TargetMachine &TM;
+ public:
+ X8664_ELFTargetObjectFile(const X86TargetMachine &tm)
+ :TM(tm) { }
+ virtual unsigned getPersonalityEncoding() const;
+ virtual unsigned getLSDAEncoding() const;
+ virtual unsigned getFDEEncoding() const;
+ virtual unsigned getTTypeEncoding() const;
+ };
+
+} // end namespace llvm
+
+#endif