Diffstat (limited to 'contrib/llvm/lib/Target/X86')
79 files changed, 59959 insertions, 0 deletions
diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmLexer.cpp b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmLexer.cpp new file mode 100644 index 0000000..1eaccff --- /dev/null +++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmLexer.cpp @@ -0,0 +1,165 @@ +//===-- X86AsmLexer.cpp - Tokenize X86 assembly to AsmTokens --------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/X86BaseInfo.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCParser/MCAsmLexer.h" +#include "llvm/MC/MCParser/MCParsedAsmOperand.h" +#include "llvm/MC/MCTargetAsmLexer.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" + +using namespace llvm; + +namespace { + +class X86AsmLexer : public MCTargetAsmLexer { + const MCAsmInfo &AsmInfo; + + bool tentativeIsValid; + AsmToken tentativeToken; + + const AsmToken &lexTentative() { + tentativeToken = getLexer()->Lex(); + tentativeIsValid = true; + return tentativeToken; + } + + const AsmToken &lexDefinite() { + if (tentativeIsValid) { + tentativeIsValid = false; + return tentativeToken; + } + return getLexer()->Lex(); + } + + AsmToken LexTokenATT(); + AsmToken LexTokenIntel(); +protected: + AsmToken LexToken() { + if (!Lexer) { + SetError(SMLoc(), "No MCAsmLexer installed"); + return AsmToken(AsmToken::Error, "", 0); + } + + switch (AsmInfo.getAssemblerDialect()) { + default: + SetError(SMLoc(), "Unhandled dialect"); + return AsmToken(AsmToken::Error, "", 0); + case 0: + return LexTokenATT(); + case 1: + return LexTokenIntel(); + } + } +public: + X86AsmLexer(const Target &T, const MCRegisterInfo &MRI, const MCAsmInfo &MAI) + : MCTargetAsmLexer(T), AsmInfo(MAI), tentativeIsValid(false) { + } +}; + +} // end anonymous namespace + +#define GET_REGISTER_MATCHER +#include "X86GenAsmMatcher.inc" + +AsmToken X86AsmLexer::LexTokenATT() { + AsmToken lexedToken = lexDefinite(); + + switch (lexedToken.getKind()) { + default: + return lexedToken; + case AsmToken::Error: + SetError(Lexer->getErrLoc(), Lexer->getErr()); + return lexedToken; + + case AsmToken::Percent: { + const AsmToken &nextToken = lexTentative(); + if (nextToken.getKind() != AsmToken::Identifier) + return lexedToken; + + + if (unsigned regID = MatchRegisterName(nextToken.getString())) { + lexDefinite(); + + // FIXME: This is completely wrong when there is a space or other + // punctuation between the % and the register name. + StringRef regStr(lexedToken.getString().data(), + lexedToken.getString().size() + + nextToken.getString().size()); + + return AsmToken(AsmToken::Register, regStr, + static_cast<int64_t>(regID)); + } + + // Match register name failed. If this is "db[0-7]", match it as an alias + // for dr[0-7]. + if (nextToken.getString().size() == 3 && + nextToken.getString().startswith("db")) { + int RegNo = -1; + switch (nextToken.getString()[2]) { + case '0': RegNo = X86::DR0; break; + case '1': RegNo = X86::DR1; break; + case '2': RegNo = X86::DR2; break; + case '3': RegNo = X86::DR3; break; + case '4': RegNo = X86::DR4; break; + case '5': RegNo = X86::DR5; break; + case '6': RegNo = X86::DR6; break; + case '7': RegNo = X86::DR7; break; + } + + if (RegNo != -1) { + lexDefinite(); + + // FIXME: This is completely wrong when there is a space or other + // punctuation between the % and the register name. 
+ StringRef regStr(lexedToken.getString().data(), + lexedToken.getString().size() + + nextToken.getString().size()); + return AsmToken(AsmToken::Register, regStr, + static_cast<int64_t>(RegNo)); + } + } + + + return lexedToken; + } + } +} + +AsmToken X86AsmLexer::LexTokenIntel() { + const AsmToken &lexedToken = lexDefinite(); + + switch(lexedToken.getKind()) { + default: + return lexedToken; + case AsmToken::Error: + SetError(Lexer->getErrLoc(), Lexer->getErr()); + return lexedToken; + case AsmToken::Identifier: { + std::string upperCase = lexedToken.getString().str(); + std::string lowerCase = LowercaseString(upperCase); + StringRef lowerRef(lowerCase); + + unsigned regID = MatchRegisterName(lowerRef); + + if (regID) + return AsmToken(AsmToken::Register, + lexedToken.getString(), + static_cast<int64_t>(regID)); + return lexedToken; + } + } +} + +extern "C" void LLVMInitializeX86AsmLexer() { + RegisterMCAsmLexer<X86AsmLexer> X(TheX86_32Target); + RegisterMCAsmLexer<X86AsmLexer> Y(TheX86_64Target); +} diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp new file mode 100644 index 0000000..cb4f15f --- /dev/null +++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -0,0 +1,1189 @@ +//===-- X86AsmParser.cpp - Parse X86 assembly to MCInst instructions ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/X86BaseInfo.h" +#include "llvm/MC/MCTargetAsmParser.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCParser/MCAsmLexer.h" +#include "llvm/MC/MCParser/MCAsmParser.h" +#include "llvm/MC/MCParser/MCParsedAsmOperand.h" +#include "llvm/ADT/OwningPtr.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +namespace { +struct X86Operand; + +class X86ATTAsmParser : public MCTargetAsmParser { + MCSubtargetInfo &STI; + MCAsmParser &Parser; + +private: + MCAsmParser &getParser() const { return Parser; } + + MCAsmLexer &getLexer() const { return Parser.getLexer(); } + + bool Error(SMLoc L, const Twine &Msg) { return Parser.Error(L, Msg); } + + X86Operand *ParseOperand(); + X86Operand *ParseMemOperand(unsigned SegReg, SMLoc StartLoc); + + bool ParseDirectiveWord(unsigned Size, SMLoc L); + bool ParseDirectiveCode(StringRef IDVal, SMLoc L); + + bool MatchAndEmitInstruction(SMLoc IDLoc, + SmallVectorImpl<MCParsedAsmOperand*> &Operands, + MCStreamer &Out); + + /// isSrcOp - Returns true if operand is either (%rsi) or %ds:%(rsi) + /// in 64bit mode or (%edi) or %es:(%edi) in 32bit mode. + bool isSrcOp(X86Operand &Op); + + /// isDstOp - Returns true if operand is either %es:(%rdi) in 64bit mode + /// or %es:(%edi) in 32bit mode. + bool isDstOp(X86Operand &Op); + + bool is64BitMode() const { + // FIXME: Can tablegen auto-generate this? 
+ return (STI.getFeatureBits() & X86::Mode64Bit) != 0; + } + void SwitchMode() { + unsigned FB = ComputeAvailableFeatures(STI.ToggleFeature(X86::Mode64Bit)); + setAvailableFeatures(FB); + } + + /// @name Auto-generated Matcher Functions + /// { + +#define GET_ASSEMBLER_HEADER +#include "X86GenAsmMatcher.inc" + + /// } + +public: + X86ATTAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser) + : MCTargetAsmParser(), STI(sti), Parser(parser) { + + // Initialize the set of available features. + setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); + } + virtual bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc); + + virtual bool ParseInstruction(StringRef Name, SMLoc NameLoc, + SmallVectorImpl<MCParsedAsmOperand*> &Operands); + + virtual bool ParseDirective(AsmToken DirectiveID); +}; +} // end anonymous namespace + +/// @name Auto-generated Match Functions +/// { + +static unsigned MatchRegisterName(StringRef Name); + +/// } + +namespace { + +/// X86Operand - Instances of this class represent a parsed X86 machine +/// instruction. +struct X86Operand : public MCParsedAsmOperand { + enum KindTy { + Token, + Register, + Immediate, + Memory + } Kind; + + SMLoc StartLoc, EndLoc; + + union { + struct { + const char *Data; + unsigned Length; + } Tok; + + struct { + unsigned RegNo; + } Reg; + + struct { + const MCExpr *Val; + } Imm; + + struct { + unsigned SegReg; + const MCExpr *Disp; + unsigned BaseReg; + unsigned IndexReg; + unsigned Scale; + } Mem; + }; + + X86Operand(KindTy K, SMLoc Start, SMLoc End) + : Kind(K), StartLoc(Start), EndLoc(End) {} + + /// getStartLoc - Get the location of the first token of this operand. + SMLoc getStartLoc() const { return StartLoc; } + /// getEndLoc - Get the location of the last token of this operand. + SMLoc getEndLoc() const { return EndLoc; } + + virtual void print(raw_ostream &OS) const {} + + StringRef getToken() const { + assert(Kind == Token && "Invalid access!"); + return StringRef(Tok.Data, Tok.Length); + } + void setTokenValue(StringRef Value) { + assert(Kind == Token && "Invalid access!"); + Tok.Data = Value.data(); + Tok.Length = Value.size(); + } + + unsigned getReg() const { + assert(Kind == Register && "Invalid access!"); + return Reg.RegNo; + } + + const MCExpr *getImm() const { + assert(Kind == Immediate && "Invalid access!"); + return Imm.Val; + } + + const MCExpr *getMemDisp() const { + assert(Kind == Memory && "Invalid access!"); + return Mem.Disp; + } + unsigned getMemSegReg() const { + assert(Kind == Memory && "Invalid access!"); + return Mem.SegReg; + } + unsigned getMemBaseReg() const { + assert(Kind == Memory && "Invalid access!"); + return Mem.BaseReg; + } + unsigned getMemIndexReg() const { + assert(Kind == Memory && "Invalid access!"); + return Mem.IndexReg; + } + unsigned getMemScale() const { + assert(Kind == Memory && "Invalid access!"); + return Mem.Scale; + } + + bool isToken() const {return Kind == Token; } + + bool isImm() const { return Kind == Immediate; } + + bool isImmSExti16i8() const { + if (!isImm()) + return false; + + // If this isn't a constant expr, just assume it fits and let relaxation + // handle it. + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) + return true; + + // Otherwise, check the value is in a range that makes sense for this + // extension. 
+ uint64_t Value = CE->getValue(); + return (( Value <= 0x000000000000007FULL)|| + (0x000000000000FF80ULL <= Value && Value <= 0x000000000000FFFFULL)|| + (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); + } + bool isImmSExti32i8() const { + if (!isImm()) + return false; + + // If this isn't a constant expr, just assume it fits and let relaxation + // handle it. + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) + return true; + + // Otherwise, check the value is in a range that makes sense for this + // extension. + uint64_t Value = CE->getValue(); + return (( Value <= 0x000000000000007FULL)|| + (0x00000000FFFFFF80ULL <= Value && Value <= 0x00000000FFFFFFFFULL)|| + (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); + } + bool isImmZExtu32u8() const { + if (!isImm()) + return false; + + // If this isn't a constant expr, just assume it fits and let relaxation + // handle it. + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) + return true; + + // Otherwise, check the value is in a range that makes sense for this + // extension. + uint64_t Value = CE->getValue(); + return (Value <= 0x00000000000000FFULL); + } + bool isImmSExti64i8() const { + if (!isImm()) + return false; + + // If this isn't a constant expr, just assume it fits and let relaxation + // handle it. + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) + return true; + + // Otherwise, check the value is in a range that makes sense for this + // extension. + uint64_t Value = CE->getValue(); + return (( Value <= 0x000000000000007FULL)|| + (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); + } + bool isImmSExti64i32() const { + if (!isImm()) + return false; + + // If this isn't a constant expr, just assume it fits and let relaxation + // handle it. + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + if (!CE) + return true; + + // Otherwise, check the value is in a range that makes sense for this + // extension. + uint64_t Value = CE->getValue(); + return (( Value <= 0x000000007FFFFFFFULL)|| + (0xFFFFFFFF80000000ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); + } + + bool isMem() const { return Kind == Memory; } + + bool isAbsMem() const { + return Kind == Memory && !getMemSegReg() && !getMemBaseReg() && + !getMemIndexReg() && getMemScale() == 1; + } + + bool isReg() const { return Kind == Register; } + + void addExpr(MCInst &Inst, const MCExpr *Expr) const { + // Add as immediates when possible. 
+ if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr)) + Inst.addOperand(MCOperand::CreateImm(CE->getValue())); + else + Inst.addOperand(MCOperand::CreateExpr(Expr)); + } + + void addRegOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateReg(getReg())); + } + + void addImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + addExpr(Inst, getImm()); + } + + void addMemOperands(MCInst &Inst, unsigned N) const { + assert((N == 5) && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateReg(getMemBaseReg())); + Inst.addOperand(MCOperand::CreateImm(getMemScale())); + Inst.addOperand(MCOperand::CreateReg(getMemIndexReg())); + addExpr(Inst, getMemDisp()); + Inst.addOperand(MCOperand::CreateReg(getMemSegReg())); + } + + void addAbsMemOperands(MCInst &Inst, unsigned N) const { + assert((N == 1) && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateExpr(getMemDisp())); + } + + static X86Operand *CreateToken(StringRef Str, SMLoc Loc) { + X86Operand *Res = new X86Operand(Token, Loc, Loc); + Res->Tok.Data = Str.data(); + Res->Tok.Length = Str.size(); + return Res; + } + + static X86Operand *CreateReg(unsigned RegNo, SMLoc StartLoc, SMLoc EndLoc) { + X86Operand *Res = new X86Operand(Register, StartLoc, EndLoc); + Res->Reg.RegNo = RegNo; + return Res; + } + + static X86Operand *CreateImm(const MCExpr *Val, SMLoc StartLoc, SMLoc EndLoc){ + X86Operand *Res = new X86Operand(Immediate, StartLoc, EndLoc); + Res->Imm.Val = Val; + return Res; + } + + /// Create an absolute memory operand. + static X86Operand *CreateMem(const MCExpr *Disp, SMLoc StartLoc, + SMLoc EndLoc) { + X86Operand *Res = new X86Operand(Memory, StartLoc, EndLoc); + Res->Mem.SegReg = 0; + Res->Mem.Disp = Disp; + Res->Mem.BaseReg = 0; + Res->Mem.IndexReg = 0; + Res->Mem.Scale = 1; + return Res; + } + + /// Create a generalized memory operand. + static X86Operand *CreateMem(unsigned SegReg, const MCExpr *Disp, + unsigned BaseReg, unsigned IndexReg, + unsigned Scale, SMLoc StartLoc, SMLoc EndLoc) { + // We should never just have a displacement, that should be parsed as an + // absolute memory operand. + assert((SegReg || BaseReg || IndexReg) && "Invalid memory operand!"); + + // The scale should always be one of {1,2,4,8}. + assert(((Scale == 1 || Scale == 2 || Scale == 4 || Scale == 8)) && + "Invalid scale!"); + X86Operand *Res = new X86Operand(Memory, StartLoc, EndLoc); + Res->Mem.SegReg = SegReg; + Res->Mem.Disp = Disp; + Res->Mem.BaseReg = BaseReg; + Res->Mem.IndexReg = IndexReg; + Res->Mem.Scale = Scale; + return Res; + } +}; + +} // end anonymous namespace. + +bool X86ATTAsmParser::isSrcOp(X86Operand &Op) { + unsigned basereg = is64BitMode() ? X86::RSI : X86::ESI; + + return (Op.isMem() && + (Op.Mem.SegReg == 0 || Op.Mem.SegReg == X86::DS) && + isa<MCConstantExpr>(Op.Mem.Disp) && + cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 && + Op.Mem.BaseReg == basereg && Op.Mem.IndexReg == 0); +} + +bool X86ATTAsmParser::isDstOp(X86Operand &Op) { + unsigned basereg = is64BitMode() ? 
X86::RDI : X86::EDI; + + return Op.isMem() && Op.Mem.SegReg == X86::ES && + isa<MCConstantExpr>(Op.Mem.Disp) && + cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 && + Op.Mem.BaseReg == basereg && Op.Mem.IndexReg == 0; +} + +bool X86ATTAsmParser::ParseRegister(unsigned &RegNo, + SMLoc &StartLoc, SMLoc &EndLoc) { + RegNo = 0; + const AsmToken &TokPercent = Parser.getTok(); + assert(TokPercent.is(AsmToken::Percent) && "Invalid token kind!"); + StartLoc = TokPercent.getLoc(); + Parser.Lex(); // Eat percent token. + + const AsmToken &Tok = Parser.getTok(); + if (Tok.isNot(AsmToken::Identifier)) + return Error(Tok.getLoc(), "invalid register name"); + + RegNo = MatchRegisterName(Tok.getString()); + + // If the match failed, try the register name as lowercase. + if (RegNo == 0) + RegNo = MatchRegisterName(LowercaseString(Tok.getString())); + + if (!is64BitMode()) { + // FIXME: This should be done using Requires<In32BitMode> and + // Requires<In64BitMode> so "eiz" usage in 64-bit instructions can be also + // checked. + // FIXME: Check AH, CH, DH, BH cannot be used in an instruction requiring a + // REX prefix. + if (RegNo == X86::RIZ || + X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo) || + X86II::isX86_64NonExtLowByteReg(RegNo) || + X86II::isX86_64ExtendedReg(RegNo)) + return Error(Tok.getLoc(), "register %" + + Tok.getString() + " is only available in 64-bit mode"); + } + + // Parse "%st" as "%st(0)" and "%st(1)", which is multiple tokens. + if (RegNo == 0 && (Tok.getString() == "st" || Tok.getString() == "ST")) { + RegNo = X86::ST0; + EndLoc = Tok.getLoc(); + Parser.Lex(); // Eat 'st' + + // Check to see if we have '(4)' after %st. + if (getLexer().isNot(AsmToken::LParen)) + return false; + // Lex the paren. + getParser().Lex(); + + const AsmToken &IntTok = Parser.getTok(); + if (IntTok.isNot(AsmToken::Integer)) + return Error(IntTok.getLoc(), "expected stack index"); + switch (IntTok.getIntVal()) { + case 0: RegNo = X86::ST0; break; + case 1: RegNo = X86::ST1; break; + case 2: RegNo = X86::ST2; break; + case 3: RegNo = X86::ST3; break; + case 4: RegNo = X86::ST4; break; + case 5: RegNo = X86::ST5; break; + case 6: RegNo = X86::ST6; break; + case 7: RegNo = X86::ST7; break; + default: return Error(IntTok.getLoc(), "invalid stack index"); + } + + if (getParser().Lex().isNot(AsmToken::RParen)) + return Error(Parser.getTok().getLoc(), "expected ')'"); + + EndLoc = Tok.getLoc(); + Parser.Lex(); // Eat ')' + return false; + } + + // If this is "db[0-7]", match it as an alias + // for dr[0-7]. + if (RegNo == 0 && Tok.getString().size() == 3 && + Tok.getString().startswith("db")) { + switch (Tok.getString()[2]) { + case '0': RegNo = X86::DR0; break; + case '1': RegNo = X86::DR1; break; + case '2': RegNo = X86::DR2; break; + case '3': RegNo = X86::DR3; break; + case '4': RegNo = X86::DR4; break; + case '5': RegNo = X86::DR5; break; + case '6': RegNo = X86::DR6; break; + case '7': RegNo = X86::DR7; break; + } + + if (RegNo != 0) { + EndLoc = Tok.getLoc(); + Parser.Lex(); // Eat it. + return false; + } + } + + if (RegNo == 0) + return Error(Tok.getLoc(), "invalid register name"); + + EndLoc = Tok.getLoc(); + Parser.Lex(); // Eat identifier token. + return false; +} + +X86Operand *X86ATTAsmParser::ParseOperand() { + switch (getLexer().getKind()) { + default: + // Parse a memory operand with no segment register. + return ParseMemOperand(0, Parser.getTok().getLoc()); + case AsmToken::Percent: { + // Read the register. 
+ unsigned RegNo; + SMLoc Start, End; + if (ParseRegister(RegNo, Start, End)) return 0; + if (RegNo == X86::EIZ || RegNo == X86::RIZ) { + Error(Start, "%eiz and %riz can only be used as index registers"); + return 0; + } + + // If this is a segment register followed by a ':', then this is the start + // of a memory reference, otherwise this is a normal register reference. + if (getLexer().isNot(AsmToken::Colon)) + return X86Operand::CreateReg(RegNo, Start, End); + + + getParser().Lex(); // Eat the colon. + return ParseMemOperand(RegNo, Start); + } + case AsmToken::Dollar: { + // $42 -> immediate. + SMLoc Start = Parser.getTok().getLoc(), End; + Parser.Lex(); + const MCExpr *Val; + if (getParser().ParseExpression(Val, End)) + return 0; + return X86Operand::CreateImm(Val, Start, End); + } + } +} + +/// ParseMemOperand: segment: disp(basereg, indexreg, scale). The '%ds:' prefix +/// has already been parsed if present. +X86Operand *X86ATTAsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) { + + // We have to disambiguate a parenthesized expression "(4+5)" from the start + // of a memory operand with a missing displacement "(%ebx)" or "(,%eax)". The + // only way to do this without lookahead is to eat the '(' and see what is + // after it. + const MCExpr *Disp = MCConstantExpr::Create(0, getParser().getContext()); + if (getLexer().isNot(AsmToken::LParen)) { + SMLoc ExprEnd; + if (getParser().ParseExpression(Disp, ExprEnd)) return 0; + + // After parsing the base expression we could either have a parenthesized + // memory address or not. If not, return now. If so, eat the (. + if (getLexer().isNot(AsmToken::LParen)) { + // Unless we have a segment register, treat this as an immediate. + if (SegReg == 0) + return X86Operand::CreateMem(Disp, MemStart, ExprEnd); + return X86Operand::CreateMem(SegReg, Disp, 0, 0, 1, MemStart, ExprEnd); + } + + // Eat the '('. + Parser.Lex(); + } else { + // Okay, we have a '('. We don't know if this is an expression or not, but + // so we have to eat the ( to see beyond it. + SMLoc LParenLoc = Parser.getTok().getLoc(); + Parser.Lex(); // Eat the '('. + + if (getLexer().is(AsmToken::Percent) || getLexer().is(AsmToken::Comma)) { + // Nothing to do here, fall into the code below with the '(' part of the + // memory operand consumed. + } else { + SMLoc ExprEnd; + + // It must be an parenthesized expression, parse it now. + if (getParser().ParseParenExpression(Disp, ExprEnd)) + return 0; + + // After parsing the base expression we could either have a parenthesized + // memory address or not. If not, return now. If so, eat the (. + if (getLexer().isNot(AsmToken::LParen)) { + // Unless we have a segment register, treat this as an immediate. + if (SegReg == 0) + return X86Operand::CreateMem(Disp, LParenLoc, ExprEnd); + return X86Operand::CreateMem(SegReg, Disp, 0, 0, 1, MemStart, ExprEnd); + } + + // Eat the '('. + Parser.Lex(); + } + } + + // If we reached here, then we just ate the ( of the memory operand. Process + // the rest of the memory operand. + unsigned BaseReg = 0, IndexReg = 0, Scale = 1; + + if (getLexer().is(AsmToken::Percent)) { + SMLoc L; + if (ParseRegister(BaseReg, L, L)) return 0; + if (BaseReg == X86::EIZ || BaseReg == X86::RIZ) { + Error(L, "eiz and riz can only be used as index registers"); + return 0; + } + } + + if (getLexer().is(AsmToken::Comma)) { + Parser.Lex(); // Eat the comma. + + // Following the comma we should have either an index register, or a scale + // value. 
We don't support the later form, but we want to parse it + // correctly. + // + // Not that even though it would be completely consistent to support syntax + // like "1(%eax,,1)", the assembler doesn't. Use "eiz" or "riz" for this. + if (getLexer().is(AsmToken::Percent)) { + SMLoc L; + if (ParseRegister(IndexReg, L, L)) return 0; + + if (getLexer().isNot(AsmToken::RParen)) { + // Parse the scale amount: + // ::= ',' [scale-expression] + if (getLexer().isNot(AsmToken::Comma)) { + Error(Parser.getTok().getLoc(), + "expected comma in scale expression"); + return 0; + } + Parser.Lex(); // Eat the comma. + + if (getLexer().isNot(AsmToken::RParen)) { + SMLoc Loc = Parser.getTok().getLoc(); + + int64_t ScaleVal; + if (getParser().ParseAbsoluteExpression(ScaleVal)) + return 0; + + // Validate the scale amount. + if (ScaleVal != 1 && ScaleVal != 2 && ScaleVal != 4 && ScaleVal != 8){ + Error(Loc, "scale factor in address must be 1, 2, 4 or 8"); + return 0; + } + Scale = (unsigned)ScaleVal; + } + } + } else if (getLexer().isNot(AsmToken::RParen)) { + // A scale amount without an index is ignored. + // index. + SMLoc Loc = Parser.getTok().getLoc(); + + int64_t Value; + if (getParser().ParseAbsoluteExpression(Value)) + return 0; + + if (Value != 1) + Warning(Loc, "scale factor without index register is ignored"); + Scale = 1; + } + } + + // Ok, we've eaten the memory operand, verify we have a ')' and eat it too. + if (getLexer().isNot(AsmToken::RParen)) { + Error(Parser.getTok().getLoc(), "unexpected token in memory operand"); + return 0; + } + SMLoc MemEnd = Parser.getTok().getLoc(); + Parser.Lex(); // Eat the ')'. + + return X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale, + MemStart, MemEnd); +} + +bool X86ATTAsmParser:: +ParseInstruction(StringRef Name, SMLoc NameLoc, + SmallVectorImpl<MCParsedAsmOperand*> &Operands) { + StringRef PatchedName = Name; + + // FIXME: Hack to recognize setneb as setne. + if (PatchedName.startswith("set") && PatchedName.endswith("b") && + PatchedName != "setb" && PatchedName != "setnb") + PatchedName = PatchedName.substr(0, Name.size()-1); + + // FIXME: Hack to recognize cmp<comparison code>{ss,sd,ps,pd}. + const MCExpr *ExtraImmOp = 0; + if ((PatchedName.startswith("cmp") || PatchedName.startswith("vcmp")) && + (PatchedName.endswith("ss") || PatchedName.endswith("sd") || + PatchedName.endswith("ps") || PatchedName.endswith("pd"))) { + bool IsVCMP = PatchedName.startswith("vcmp"); + unsigned SSECCIdx = IsVCMP ? 4 : 3; + unsigned SSEComparisonCode = StringSwitch<unsigned>( + PatchedName.slice(SSECCIdx, PatchedName.size() - 2)) + .Case("eq", 0) + .Case("lt", 1) + .Case("le", 2) + .Case("unord", 3) + .Case("neq", 4) + .Case("nlt", 5) + .Case("nle", 6) + .Case("ord", 7) + .Case("eq_uq", 8) + .Case("nge", 9) + .Case("ngt", 0x0A) + .Case("false", 0x0B) + .Case("neq_oq", 0x0C) + .Case("ge", 0x0D) + .Case("gt", 0x0E) + .Case("true", 0x0F) + .Case("eq_os", 0x10) + .Case("lt_oq", 0x11) + .Case("le_oq", 0x12) + .Case("unord_s", 0x13) + .Case("neq_us", 0x14) + .Case("nlt_uq", 0x15) + .Case("nle_uq", 0x16) + .Case("ord_s", 0x17) + .Case("eq_us", 0x18) + .Case("nge_uq", 0x19) + .Case("ngt_uq", 0x1A) + .Case("false_os", 0x1B) + .Case("neq_os", 0x1C) + .Case("ge_oq", 0x1D) + .Case("gt_oq", 0x1E) + .Case("true_us", 0x1F) + .Default(~0U); + if (SSEComparisonCode != ~0U) { + ExtraImmOp = MCConstantExpr::Create(SSEComparisonCode, + getParser().getContext()); + if (PatchedName.endswith("ss")) { + PatchedName = IsVCMP ? 
"vcmpss" : "cmpss"; + } else if (PatchedName.endswith("sd")) { + PatchedName = IsVCMP ? "vcmpsd" : "cmpsd"; + } else if (PatchedName.endswith("ps")) { + PatchedName = IsVCMP ? "vcmpps" : "cmpps"; + } else { + assert(PatchedName.endswith("pd") && "Unexpected mnemonic!"); + PatchedName = IsVCMP ? "vcmppd" : "cmppd"; + } + } + } + + Operands.push_back(X86Operand::CreateToken(PatchedName, NameLoc)); + + if (ExtraImmOp) + Operands.push_back(X86Operand::CreateImm(ExtraImmOp, NameLoc, NameLoc)); + + + // Determine whether this is an instruction prefix. + bool isPrefix = + Name == "lock" || Name == "rep" || + Name == "repe" || Name == "repz" || + Name == "repne" || Name == "repnz" || + Name == "rex64" || Name == "data16"; + + + // This does the actual operand parsing. Don't parse any more if we have a + // prefix juxtaposed with an operation like "lock incl 4(%rax)", because we + // just want to parse the "lock" as the first instruction and the "incl" as + // the next one. + if (getLexer().isNot(AsmToken::EndOfStatement) && !isPrefix) { + + // Parse '*' modifier. + if (getLexer().is(AsmToken::Star)) { + SMLoc Loc = Parser.getTok().getLoc(); + Operands.push_back(X86Operand::CreateToken("*", Loc)); + Parser.Lex(); // Eat the star. + } + + // Read the first operand. + if (X86Operand *Op = ParseOperand()) + Operands.push_back(Op); + else { + Parser.EatToEndOfStatement(); + return true; + } + + while (getLexer().is(AsmToken::Comma)) { + Parser.Lex(); // Eat the comma. + + // Parse and remember the operand. + if (X86Operand *Op = ParseOperand()) + Operands.push_back(Op); + else { + Parser.EatToEndOfStatement(); + return true; + } + } + + if (getLexer().isNot(AsmToken::EndOfStatement)) { + SMLoc Loc = getLexer().getLoc(); + Parser.EatToEndOfStatement(); + return Error(Loc, "unexpected token in argument list"); + } + } + + if (getLexer().is(AsmToken::EndOfStatement)) + Parser.Lex(); // Consume the EndOfStatement + else if (isPrefix && getLexer().is(AsmToken::Slash)) + Parser.Lex(); // Consume the prefix separator Slash + + // This is a terrible hack to handle "out[bwl]? %al, (%dx)" -> + // "outb %al, %dx". Out doesn't take a memory form, but this is a widely + // documented form in various unofficial manuals, so a lot of code uses it. + if ((Name == "outb" || Name == "outw" || Name == "outl" || Name == "out") && + Operands.size() == 3) { + X86Operand &Op = *(X86Operand*)Operands.back(); + if (Op.isMem() && Op.Mem.SegReg == 0 && + isa<MCConstantExpr>(Op.Mem.Disp) && + cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 && + Op.Mem.BaseReg == MatchRegisterName("dx") && Op.Mem.IndexReg == 0) { + SMLoc Loc = Op.getEndLoc(); + Operands.back() = X86Operand::CreateReg(Op.Mem.BaseReg, Loc, Loc); + delete &Op; + } + } + // Same hack for "in[bwl]? (%dx), %al" -> "inb %dx, %al". 
+ if ((Name == "inb" || Name == "inw" || Name == "inl" || Name == "in") && + Operands.size() == 3) { + X86Operand &Op = *(X86Operand*)Operands.begin()[1]; + if (Op.isMem() && Op.Mem.SegReg == 0 && + isa<MCConstantExpr>(Op.Mem.Disp) && + cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 && + Op.Mem.BaseReg == MatchRegisterName("dx") && Op.Mem.IndexReg == 0) { + SMLoc Loc = Op.getEndLoc(); + Operands.begin()[1] = X86Operand::CreateReg(Op.Mem.BaseReg, Loc, Loc); + delete &Op; + } + } + // Transform "ins[bwl] %dx, %es:(%edi)" into "ins[bwl]" + if (Name.startswith("ins") && Operands.size() == 3 && + (Name == "insb" || Name == "insw" || Name == "insl")) { + X86Operand &Op = *(X86Operand*)Operands.begin()[1]; + X86Operand &Op2 = *(X86Operand*)Operands.begin()[2]; + if (Op.isReg() && Op.getReg() == X86::DX && isDstOp(Op2)) { + Operands.pop_back(); + Operands.pop_back(); + delete &Op; + delete &Op2; + } + } + + // Transform "outs[bwl] %ds:(%esi), %dx" into "out[bwl]" + if (Name.startswith("outs") && Operands.size() == 3 && + (Name == "outsb" || Name == "outsw" || Name == "outsl")) { + X86Operand &Op = *(X86Operand*)Operands.begin()[1]; + X86Operand &Op2 = *(X86Operand*)Operands.begin()[2]; + if (isSrcOp(Op) && Op2.isReg() && Op2.getReg() == X86::DX) { + Operands.pop_back(); + Operands.pop_back(); + delete &Op; + delete &Op2; + } + } + + // Transform "movs[bwl] %ds:(%esi), %es:(%edi)" into "movs[bwl]" + if (Name.startswith("movs") && Operands.size() == 3 && + (Name == "movsb" || Name == "movsw" || Name == "movsl" || + (is64BitMode() && Name == "movsq"))) { + X86Operand &Op = *(X86Operand*)Operands.begin()[1]; + X86Operand &Op2 = *(X86Operand*)Operands.begin()[2]; + if (isSrcOp(Op) && isDstOp(Op2)) { + Operands.pop_back(); + Operands.pop_back(); + delete &Op; + delete &Op2; + } + } + // Transform "lods[bwl] %ds:(%esi),{%al,%ax,%eax,%rax}" into "lods[bwl]" + if (Name.startswith("lods") && Operands.size() == 3 && + (Name == "lods" || Name == "lodsb" || Name == "lodsw" || + Name == "lodsl" || (is64BitMode() && Name == "lodsq"))) { + X86Operand *Op1 = static_cast<X86Operand*>(Operands[1]); + X86Operand *Op2 = static_cast<X86Operand*>(Operands[2]); + if (isSrcOp(*Op1) && Op2->isReg()) { + const char *ins; + unsigned reg = Op2->getReg(); + bool isLods = Name == "lods"; + if (reg == X86::AL && (isLods || Name == "lodsb")) + ins = "lodsb"; + else if (reg == X86::AX && (isLods || Name == "lodsw")) + ins = "lodsw"; + else if (reg == X86::EAX && (isLods || Name == "lodsl")) + ins = "lodsl"; + else if (reg == X86::RAX && (isLods || Name == "lodsq")) + ins = "lodsq"; + else + ins = NULL; + if (ins != NULL) { + Operands.pop_back(); + Operands.pop_back(); + delete Op1; + delete Op2; + if (Name != ins) + static_cast<X86Operand*>(Operands[0])->setTokenValue(ins); + } + } + } + // Transform "stos[bwl] {%al,%ax,%eax,%rax},%es:(%edi)" into "stos[bwl]" + if (Name.startswith("stos") && Operands.size() == 3 && + (Name == "stos" || Name == "stosb" || Name == "stosw" || + Name == "stosl" || (is64BitMode() && Name == "stosq"))) { + X86Operand *Op1 = static_cast<X86Operand*>(Operands[1]); + X86Operand *Op2 = static_cast<X86Operand*>(Operands[2]); + if (isDstOp(*Op2) && Op1->isReg()) { + const char *ins; + unsigned reg = Op1->getReg(); + bool isStos = Name == "stos"; + if (reg == X86::AL && (isStos || Name == "stosb")) + ins = "stosb"; + else if (reg == X86::AX && (isStos || Name == "stosw")) + ins = "stosw"; + else if (reg == X86::EAX && (isStos || Name == "stosl")) + ins = "stosl"; + else if (reg == X86::RAX && (isStos || 
Name == "stosq")) + ins = "stosq"; + else + ins = NULL; + if (ins != NULL) { + Operands.pop_back(); + Operands.pop_back(); + delete Op1; + delete Op2; + if (Name != ins) + static_cast<X86Operand*>(Operands[0])->setTokenValue(ins); + } + } + } + + // FIXME: Hack to handle recognize s{hr,ar,hl} $1, <op>. Canonicalize to + // "shift <op>". + if ((Name.startswith("shr") || Name.startswith("sar") || + Name.startswith("shl") || Name.startswith("sal") || + Name.startswith("rcl") || Name.startswith("rcr") || + Name.startswith("rol") || Name.startswith("ror")) && + Operands.size() == 3) { + X86Operand *Op1 = static_cast<X86Operand*>(Operands[1]); + if (Op1->isImm() && isa<MCConstantExpr>(Op1->getImm()) && + cast<MCConstantExpr>(Op1->getImm())->getValue() == 1) { + delete Operands[1]; + Operands.erase(Operands.begin() + 1); + } + } + + // Transforms "int $3" into "int3" as a size optimization. We can't write an + // instalias with an immediate operand yet. + if (Name == "int" && Operands.size() == 2) { + X86Operand *Op1 = static_cast<X86Operand*>(Operands[1]); + if (Op1->isImm() && isa<MCConstantExpr>(Op1->getImm()) && + cast<MCConstantExpr>(Op1->getImm())->getValue() == 3) { + delete Operands[1]; + Operands.erase(Operands.begin() + 1); + static_cast<X86Operand*>(Operands[0])->setTokenValue("int3"); + } + } + + return false; +} + +bool X86ATTAsmParser:: +MatchAndEmitInstruction(SMLoc IDLoc, + SmallVectorImpl<MCParsedAsmOperand*> &Operands, + MCStreamer &Out) { + assert(!Operands.empty() && "Unexpect empty operand list!"); + X86Operand *Op = static_cast<X86Operand*>(Operands[0]); + assert(Op->isToken() && "Leading operand should always be a mnemonic!"); + + // First, handle aliases that expand to multiple instructions. + // FIXME: This should be replaced with a real .td file alias mechanism. + // Also, MatchInstructionImpl should do actually *do* the EmitInstruction + // call. + if (Op->getToken() == "fstsw" || Op->getToken() == "fstcw" || + Op->getToken() == "fstsww" || Op->getToken() == "fstcww" || + Op->getToken() == "finit" || Op->getToken() == "fsave" || + Op->getToken() == "fstenv" || Op->getToken() == "fclex") { + MCInst Inst; + Inst.setOpcode(X86::WAIT); + Out.EmitInstruction(Inst); + + const char *Repl = + StringSwitch<const char*>(Op->getToken()) + .Case("finit", "fninit") + .Case("fsave", "fnsave") + .Case("fstcw", "fnstcw") + .Case("fstcww", "fnstcw") + .Case("fstenv", "fnstenv") + .Case("fstsw", "fnstsw") + .Case("fstsww", "fnstsw") + .Case("fclex", "fnclex") + .Default(0); + assert(Repl && "Unknown wait-prefixed instruction"); + delete Operands[0]; + Operands[0] = X86Operand::CreateToken(Repl, IDLoc); + } + + bool WasOriginallyInvalidOperand = false; + unsigned OrigErrorInfo; + MCInst Inst; + + // First, try a direct match. + switch (MatchInstructionImpl(Operands, Inst, OrigErrorInfo)) { + default: break; + case Match_Success: + Out.EmitInstruction(Inst); + return false; + case Match_MissingFeature: + Error(IDLoc, "instruction requires a CPU feature not currently enabled"); + return true; + case Match_ConversionFail: + return Error(IDLoc, "unable to convert operands to instruction"); + case Match_InvalidOperand: + WasOriginallyInvalidOperand = true; + break; + case Match_MnemonicFail: + break; + } + + // FIXME: Ideally, we would only attempt suffix matches for things which are + // valid prefixes, and we could just infer the right unambiguous + // type. However, that requires substantially more matcher support than the + // following hack. 
+ + // Change the operand to point to a temporary token. + StringRef Base = Op->getToken(); + SmallString<16> Tmp; + Tmp += Base; + Tmp += ' '; + Op->setTokenValue(Tmp.str()); + + // If this instruction starts with an 'f', then it is a floating point stack + // instruction. These come in up to three forms for 32-bit, 64-bit, and + // 80-bit floating point, which use the suffixes s,l,t respectively. + // + // Otherwise, we assume that this may be an integer instruction, which comes + // in 8/16/32/64-bit forms using the b,w,l,q suffixes respectively. + const char *Suffixes = Base[0] != 'f' ? "bwlq" : "slt\0"; + + // Check for the various suffix matches. + Tmp[Base.size()] = Suffixes[0]; + unsigned ErrorInfoIgnore; + unsigned Match1, Match2, Match3, Match4; + + Match1 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore); + Tmp[Base.size()] = Suffixes[1]; + Match2 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore); + Tmp[Base.size()] = Suffixes[2]; + Match3 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore); + Tmp[Base.size()] = Suffixes[3]; + Match4 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore); + + // Restore the old token. + Op->setTokenValue(Base); + + // If exactly one matched, then we treat that as a successful match (and the + // instruction will already have been filled in correctly, since the failing + // matches won't have modified it). + unsigned NumSuccessfulMatches = + (Match1 == Match_Success) + (Match2 == Match_Success) + + (Match3 == Match_Success) + (Match4 == Match_Success); + if (NumSuccessfulMatches == 1) { + Out.EmitInstruction(Inst); + return false; + } + + // Otherwise, the match failed, try to produce a decent error message. + + // If we had multiple suffix matches, then identify this as an ambiguous + // match. + if (NumSuccessfulMatches > 1) { + char MatchChars[4]; + unsigned NumMatches = 0; + if (Match1 == Match_Success) MatchChars[NumMatches++] = Suffixes[0]; + if (Match2 == Match_Success) MatchChars[NumMatches++] = Suffixes[1]; + if (Match3 == Match_Success) MatchChars[NumMatches++] = Suffixes[2]; + if (Match4 == Match_Success) MatchChars[NumMatches++] = Suffixes[3]; + + SmallString<126> Msg; + raw_svector_ostream OS(Msg); + OS << "ambiguous instructions require an explicit suffix (could be "; + for (unsigned i = 0; i != NumMatches; ++i) { + if (i != 0) + OS << ", "; + if (i + 1 == NumMatches) + OS << "or "; + OS << "'" << Base << MatchChars[i] << "'"; + } + OS << ")"; + Error(IDLoc, OS.str()); + return true; + } + + // Okay, we know that none of the variants matched successfully. + + // If all of the instructions reported an invalid mnemonic, then the original + // mnemonic was invalid. + if ((Match1 == Match_MnemonicFail) && (Match2 == Match_MnemonicFail) && + (Match3 == Match_MnemonicFail) && (Match4 == Match_MnemonicFail)) { + if (!WasOriginallyInvalidOperand) { + Error(IDLoc, "invalid instruction mnemonic '" + Base + "'"); + return true; + } + + // Recover location info for the operand if we know which was the problem. + SMLoc ErrorLoc = IDLoc; + if (OrigErrorInfo != ~0U) { + if (OrigErrorInfo >= Operands.size()) + return Error(IDLoc, "too few operands for instruction"); + + ErrorLoc = ((X86Operand*)Operands[OrigErrorInfo])->getStartLoc(); + if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; + } + + return Error(ErrorLoc, "invalid operand for instruction"); + } + + // If one instruction matched with a missing feature, report this as a + // missing feature. 
+ if ((Match1 == Match_MissingFeature) + (Match2 == Match_MissingFeature) + + (Match3 == Match_MissingFeature) + (Match4 == Match_MissingFeature) == 1){ + Error(IDLoc, "instruction requires a CPU feature not currently enabled"); + return true; + } + + // If one instruction matched with an invalid operand, report this as an + // operand failure. + if ((Match1 == Match_InvalidOperand) + (Match2 == Match_InvalidOperand) + + (Match3 == Match_InvalidOperand) + (Match4 == Match_InvalidOperand) == 1){ + Error(IDLoc, "invalid operand for instruction"); + return true; + } + + // If all of these were an outright failure, report it in a useless way. + // FIXME: We should give nicer diagnostics about the exact failure. + Error(IDLoc, "unknown use of instruction mnemonic without a size suffix"); + return true; +} + + +bool X86ATTAsmParser::ParseDirective(AsmToken DirectiveID) { + StringRef IDVal = DirectiveID.getIdentifier(); + if (IDVal == ".word") + return ParseDirectiveWord(2, DirectiveID.getLoc()); + else if (IDVal.startswith(".code")) + return ParseDirectiveCode(IDVal, DirectiveID.getLoc()); + return true; +} + +/// ParseDirectiveWord +/// ::= .word [ expression (, expression)* ] +bool X86ATTAsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) { + if (getLexer().isNot(AsmToken::EndOfStatement)) { + for (;;) { + const MCExpr *Value; + if (getParser().ParseExpression(Value)) + return true; + + getParser().getStreamer().EmitValue(Value, Size, 0 /*addrspace*/); + + if (getLexer().is(AsmToken::EndOfStatement)) + break; + + // FIXME: Improve diagnostic. + if (getLexer().isNot(AsmToken::Comma)) + return Error(L, "unexpected token in directive"); + Parser.Lex(); + } + } + + Parser.Lex(); + return false; +} + +/// ParseDirectiveCode +/// ::= .code32 | .code64 +bool X86ATTAsmParser::ParseDirectiveCode(StringRef IDVal, SMLoc L) { + if (IDVal == ".code32") { + Parser.Lex(); + if (is64BitMode()) { + SwitchMode(); + getParser().getStreamer().EmitAssemblerFlag(MCAF_Code32); + } + } else if (IDVal == ".code64") { + Parser.Lex(); + if (!is64BitMode()) { + SwitchMode(); + getParser().getStreamer().EmitAssemblerFlag(MCAF_Code64); + } + } else { + return Error(L, "unexpected directive " + IDVal); + } + + return false; +} + + +extern "C" void LLVMInitializeX86AsmLexer(); + +// Force static initialization. +extern "C" void LLVMInitializeX86AsmParser() { + RegisterMCAsmParser<X86ATTAsmParser> X(TheX86_32Target); + RegisterMCAsmParser<X86ATTAsmParser> Y(TheX86_64Target); + LLVMInitializeX86AsmLexer(); +} + +#define GET_REGISTER_MATCHER +#define GET_MATCHER_IMPLEMENTATION +#include "X86GenAsmMatcher.inc" diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp new file mode 100644 index 0000000..3aacb20 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -0,0 +1,607 @@ +//===- X86Disassembler.cpp - Disassembler for x86 and x86_64 ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is part of the X86 Disassembler. +// It contains code to translate the data produced by the decoder into +// MCInsts. +// Documentation for the disassembler can be found in X86Disassembler.h. 
+// +//===----------------------------------------------------------------------===// + +#include "X86Disassembler.h" +#include "X86DisassemblerDecoder.h" + +#include "llvm/MC/EDInstInfo.h" +#include "llvm/MC/MCDisassembler.h" +#include "llvm/MC/MCDisassembler.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/MemoryObject.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" + +#define GET_REGINFO_ENUM +#include "X86GenRegisterInfo.inc" +#define GET_INSTRINFO_ENUM +#include "X86GenInstrInfo.inc" +#include "X86GenEDInfo.inc" + +using namespace llvm; +using namespace llvm::X86Disassembler; + +void x86DisassemblerDebug(const char *file, + unsigned line, + const char *s) { + dbgs() << file << ":" << line << ": " << s; +} + +#define debug(s) DEBUG(x86DisassemblerDebug(__FILE__, __LINE__, s)); + +namespace llvm { + +// Fill-ins to make the compiler happy. These constants are never actually +// assigned; they are just filler to make an automatically-generated switch +// statement work. +namespace X86 { + enum { + BX_SI = 500, + BX_DI = 501, + BP_SI = 502, + BP_DI = 503, + sib = 504, + sib64 = 505 + }; +} + +extern Target TheX86_32Target, TheX86_64Target; + +} + +static bool translateInstruction(MCInst &target, + InternalInstruction &source); + +X86GenericDisassembler::X86GenericDisassembler(const MCSubtargetInfo &STI, DisassemblerMode mode) : + MCDisassembler(STI), + fMode(mode) { +} + +X86GenericDisassembler::~X86GenericDisassembler() { +} + +EDInstInfo *X86GenericDisassembler::getEDInfo() const { + return instInfoX86; +} + +/// regionReader - a callback function that wraps the readByte method from +/// MemoryObject. +/// +/// @param arg - The generic callback parameter. In this case, this should +/// be a pointer to a MemoryObject. +/// @param byte - A pointer to the byte to be read. +/// @param address - The address to be read. +static int regionReader(void* arg, uint8_t* byte, uint64_t address) { + MemoryObject* region = static_cast<MemoryObject*>(arg); + return region->readByte(address, byte); +} + +/// logger - a callback function that wraps the operator<< method from +/// raw_ostream. +/// +/// @param arg - The generic callback parameter. This should be a pointe +/// to a raw_ostream. +/// @param log - A string to be logged. logger() adds a newline. +static void logger(void* arg, const char* log) { + if (!arg) + return; + + raw_ostream &vStream = *(static_cast<raw_ostream*>(arg)); + vStream << log << "\n"; +} + +// +// Public interface for the disassembler +// + +MCDisassembler::DecodeStatus +X86GenericDisassembler::getInstruction(MCInst &instr, + uint64_t &size, + const MemoryObject ®ion, + uint64_t address, + raw_ostream &vStream, + raw_ostream &cStream) const { + InternalInstruction internalInstr; + + dlog_t loggerFn = logger; + if (&vStream == &nulls()) + loggerFn = 0; // Disable logging completely if it's going to nulls(). + + int ret = decodeInstruction(&internalInstr, + regionReader, + (void*)®ion, + loggerFn, + (void*)&vStream, + address, + fMode); + + if (ret) { + size = internalInstr.readerCursor - address; + return Fail; + } + else { + size = internalInstr.length; + return (!translateInstruction(instr, internalInstr)) ? Success : Fail; + } +} + +// +// Private code that translates from struct InternalInstructions to MCInsts. +// + +/// translateRegister - Translates an internal register to the appropriate LLVM +/// register, and appends it as an operand to an MCInst. 
+/// +/// @param mcInst - The MCInst to append to. +/// @param reg - The Reg to append. +static void translateRegister(MCInst &mcInst, Reg reg) { +#define ENTRY(x) X86::x, + uint8_t llvmRegnums[] = { + ALL_REGS + 0 + }; +#undef ENTRY + + uint8_t llvmRegnum = llvmRegnums[reg]; + mcInst.addOperand(MCOperand::CreateReg(llvmRegnum)); +} + +/// translateImmediate - Appends an immediate operand to an MCInst. +/// +/// @param mcInst - The MCInst to append to. +/// @param immediate - The immediate value to append. +/// @param operand - The operand, as stored in the descriptor table. +/// @param insn - The internal instruction. +static void translateImmediate(MCInst &mcInst, uint64_t immediate, + const OperandSpecifier &operand, + InternalInstruction &insn) { + // Sign-extend the immediate if necessary. + + OperandType type = operand.type; + + if (type == TYPE_RELv) { + switch (insn.displacementSize) { + default: + break; + case 1: + type = TYPE_MOFFS8; + break; + case 2: + type = TYPE_MOFFS16; + break; + case 4: + type = TYPE_MOFFS32; + break; + case 8: + type = TYPE_MOFFS64; + break; + } + } + // By default sign-extend all X86 immediates based on their encoding. + else if (type == TYPE_IMM8 || type == TYPE_IMM16 || type == TYPE_IMM32 || + type == TYPE_IMM64) { + uint32_t Opcode = mcInst.getOpcode(); + switch (operand.encoding) { + default: + break; + case ENCODING_IB: + // Special case those X86 instructions that use the imm8 as a set of + // bits, bit count, etc. and are not sign-extend. + if (Opcode != X86::BLENDPSrri && Opcode != X86::BLENDPDrri && + Opcode != X86::PBLENDWrri && Opcode != X86::MPSADBWrri && + Opcode != X86::DPPSrri && Opcode != X86::DPPDrri && + Opcode != X86::INSERTPSrr && Opcode != X86::VBLENDPSYrri && + Opcode != X86::VBLENDPSYrmi && Opcode != X86::VBLENDPDYrri && + Opcode != X86::VBLENDPDYrmi && Opcode != X86::VPBLENDWrri && + Opcode != X86::VMPSADBWrri && Opcode != X86::VDPPSYrri && + Opcode != X86::VDPPSYrmi && Opcode != X86::VDPPDrri && + Opcode != X86::VINSERTPSrr) + type = TYPE_MOFFS8; + break; + case ENCODING_IW: + type = TYPE_MOFFS16; + break; + case ENCODING_ID: + type = TYPE_MOFFS32; + break; + case ENCODING_IO: + type = TYPE_MOFFS64; + break; + } + } + + switch (type) { + case TYPE_XMM128: + mcInst.addOperand(MCOperand::CreateReg(X86::XMM0 + (immediate >> 4))); + return; + case TYPE_XMM256: + mcInst.addOperand(MCOperand::CreateReg(X86::YMM0 + (immediate >> 4))); + return; + case TYPE_MOFFS8: + case TYPE_REL8: + if(immediate & 0x80) + immediate |= ~(0xffull); + break; + case TYPE_MOFFS16: + if(immediate & 0x8000) + immediate |= ~(0xffffull); + break; + case TYPE_MOFFS32: + case TYPE_REL32: + case TYPE_REL64: + if(immediate & 0x80000000) + immediate |= ~(0xffffffffull); + break; + case TYPE_MOFFS64: + default: + // operand is 64 bits wide. Do nothing. + break; + } + + mcInst.addOperand(MCOperand::CreateImm(immediate)); +} + +/// translateRMRegister - Translates a register stored in the R/M field of the +/// ModR/M byte to its LLVM equivalent and appends it to an MCInst. +/// @param mcInst - The MCInst to append to. +/// @param insn - The internal instruction to extract the R/M field +/// from. 
+/// @return - 0 on success; -1 otherwise +static bool translateRMRegister(MCInst &mcInst, + InternalInstruction &insn) { + if (insn.eaBase == EA_BASE_sib || insn.eaBase == EA_BASE_sib64) { + debug("A R/M register operand may not have a SIB byte"); + return true; + } + + switch (insn.eaBase) { + default: + debug("Unexpected EA base register"); + return true; + case EA_BASE_NONE: + debug("EA_BASE_NONE for ModR/M base"); + return true; +#define ENTRY(x) case EA_BASE_##x: + ALL_EA_BASES +#undef ENTRY + debug("A R/M register operand may not have a base; " + "the operand must be a register."); + return true; +#define ENTRY(x) \ + case EA_REG_##x: \ + mcInst.addOperand(MCOperand::CreateReg(X86::x)); break; + ALL_REGS +#undef ENTRY + } + + return false; +} + +/// translateRMMemory - Translates a memory operand stored in the Mod and R/M +/// fields of an internal instruction (and possibly its SIB byte) to a memory +/// operand in LLVM's format, and appends it to an MCInst. +/// +/// @param mcInst - The MCInst to append to. +/// @param insn - The instruction to extract Mod, R/M, and SIB fields +/// from. +/// @return - 0 on success; nonzero otherwise +static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn) { + // Addresses in an MCInst are represented as five operands: + // 1. basereg (register) The R/M base, or (if there is a SIB) the + // SIB base + // 2. scaleamount (immediate) 1, or (if there is a SIB) the specified + // scale amount + // 3. indexreg (register) x86_registerNONE, or (if there is a SIB) + // the index (which is multiplied by the + // scale amount) + // 4. displacement (immediate) 0, or the displacement if there is one + // 5. segmentreg (register) x86_registerNONE for now, but could be set + // if we have segment overrides + + MCOperand baseReg; + MCOperand scaleAmount; + MCOperand indexReg; + MCOperand displacement; + MCOperand segmentReg; + + if (insn.eaBase == EA_BASE_sib || insn.eaBase == EA_BASE_sib64) { + if (insn.sibBase != SIB_BASE_NONE) { + switch (insn.sibBase) { + default: + debug("Unexpected sibBase"); + return true; +#define ENTRY(x) \ + case SIB_BASE_##x: \ + baseReg = MCOperand::CreateReg(X86::x); break; + ALL_SIB_BASES +#undef ENTRY + } + } else { + baseReg = MCOperand::CreateReg(0); + } + + if (insn.sibIndex != SIB_INDEX_NONE) { + switch (insn.sibIndex) { + default: + debug("Unexpected sibIndex"); + return true; +#define ENTRY(x) \ + case SIB_INDEX_##x: \ + indexReg = MCOperand::CreateReg(X86::x); break; + EA_BASES_32BIT + EA_BASES_64BIT +#undef ENTRY + } + } else { + indexReg = MCOperand::CreateReg(0); + } + + scaleAmount = MCOperand::CreateImm(insn.sibScale); + } else { + switch (insn.eaBase) { + case EA_BASE_NONE: + if (insn.eaDisplacement == EA_DISP_NONE) { + debug("EA_BASE_NONE and EA_DISP_NONE for ModR/M base"); + return true; + } + if (insn.mode == MODE_64BIT) + baseReg = MCOperand::CreateReg(X86::RIP); // Section 2.2.1.6 + else + baseReg = MCOperand::CreateReg(0); + + indexReg = MCOperand::CreateReg(0); + break; + case EA_BASE_BX_SI: + baseReg = MCOperand::CreateReg(X86::BX); + indexReg = MCOperand::CreateReg(X86::SI); + break; + case EA_BASE_BX_DI: + baseReg = MCOperand::CreateReg(X86::BX); + indexReg = MCOperand::CreateReg(X86::DI); + break; + case EA_BASE_BP_SI: + baseReg = MCOperand::CreateReg(X86::BP); + indexReg = MCOperand::CreateReg(X86::SI); + break; + case EA_BASE_BP_DI: + baseReg = MCOperand::CreateReg(X86::BP); + indexReg = MCOperand::CreateReg(X86::DI); + break; + default: + indexReg = MCOperand::CreateReg(0); + switch 
(insn.eaBase) { + default: + debug("Unexpected eaBase"); + return true; + // Here, we will use the fill-ins defined above. However, + // BX_SI, BX_DI, BP_SI, and BP_DI are all handled above and + // sib and sib64 were handled in the top-level if, so they're only + // placeholders to keep the compiler happy. +#define ENTRY(x) \ + case EA_BASE_##x: \ + baseReg = MCOperand::CreateReg(X86::x); break; + ALL_EA_BASES +#undef ENTRY +#define ENTRY(x) case EA_REG_##x: + ALL_REGS +#undef ENTRY + debug("A R/M memory operand may not be a register; " + "the base field must be a base."); + return true; + } + } + + scaleAmount = MCOperand::CreateImm(1); + } + + displacement = MCOperand::CreateImm(insn.displacement); + + static const uint8_t segmentRegnums[SEG_OVERRIDE_max] = { + 0, // SEG_OVERRIDE_NONE + X86::CS, + X86::SS, + X86::DS, + X86::ES, + X86::FS, + X86::GS + }; + + segmentReg = MCOperand::CreateReg(segmentRegnums[insn.segmentOverride]); + + mcInst.addOperand(baseReg); + mcInst.addOperand(scaleAmount); + mcInst.addOperand(indexReg); + mcInst.addOperand(displacement); + mcInst.addOperand(segmentReg); + return false; +} + +/// translateRM - Translates an operand stored in the R/M (and possibly SIB) +/// byte of an instruction to LLVM form, and appends it to an MCInst. +/// +/// @param mcInst - The MCInst to append to. +/// @param operand - The operand, as stored in the descriptor table. +/// @param insn - The instruction to extract Mod, R/M, and SIB fields +/// from. +/// @return - 0 on success; nonzero otherwise +static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand, + InternalInstruction &insn) { + switch (operand.type) { + default: + debug("Unexpected type for a R/M operand"); + return true; + case TYPE_R8: + case TYPE_R16: + case TYPE_R32: + case TYPE_R64: + case TYPE_Rv: + case TYPE_MM: + case TYPE_MM32: + case TYPE_MM64: + case TYPE_XMM: + case TYPE_XMM32: + case TYPE_XMM64: + case TYPE_XMM128: + case TYPE_XMM256: + case TYPE_DEBUGREG: + case TYPE_CONTROLREG: + return translateRMRegister(mcInst, insn); + case TYPE_M: + case TYPE_M8: + case TYPE_M16: + case TYPE_M32: + case TYPE_M64: + case TYPE_M128: + case TYPE_M256: + case TYPE_M512: + case TYPE_Mv: + case TYPE_M32FP: + case TYPE_M64FP: + case TYPE_M80FP: + case TYPE_M16INT: + case TYPE_M32INT: + case TYPE_M64INT: + case TYPE_M1616: + case TYPE_M1632: + case TYPE_M1664: + case TYPE_LEA: + return translateRMMemory(mcInst, insn); + } +} + +/// translateFPRegister - Translates a stack position on the FPU stack to its +/// LLVM form, and appends it to an MCInst. +/// +/// @param mcInst - The MCInst to append to. +/// @param stackPos - The stack position to translate. +/// @return - 0 on success; nonzero otherwise. +static bool translateFPRegister(MCInst &mcInst, + uint8_t stackPos) { + if (stackPos >= 8) { + debug("Invalid FP stack position"); + return true; + } + + mcInst.addOperand(MCOperand::CreateReg(X86::ST0 + stackPos)); + + return false; +} + +/// translateOperand - Translates an operand stored in an internal instruction +/// to LLVM's format and appends it to an MCInst. +/// +/// @param mcInst - The MCInst to append to. +/// @param operand - The operand, as stored in the descriptor table. +/// @param insn - The internal instruction. +/// @return - false on success; true otherwise. 
+static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand, + InternalInstruction &insn) { + switch (operand.encoding) { + default: + debug("Unhandled operand encoding during translation"); + return true; + case ENCODING_REG: + translateRegister(mcInst, insn.reg); + return false; + case ENCODING_RM: + return translateRM(mcInst, operand, insn); + case ENCODING_CB: + case ENCODING_CW: + case ENCODING_CD: + case ENCODING_CP: + case ENCODING_CO: + case ENCODING_CT: + debug("Translation of code offsets isn't supported."); + return true; + case ENCODING_IB: + case ENCODING_IW: + case ENCODING_ID: + case ENCODING_IO: + case ENCODING_Iv: + case ENCODING_Ia: + translateImmediate(mcInst, + insn.immediates[insn.numImmediatesTranslated++], + operand, + insn); + return false; + case ENCODING_RB: + case ENCODING_RW: + case ENCODING_RD: + case ENCODING_RO: + translateRegister(mcInst, insn.opcodeRegister); + return false; + case ENCODING_I: + return translateFPRegister(mcInst, insn.opcodeModifier); + case ENCODING_Rv: + translateRegister(mcInst, insn.opcodeRegister); + return false; + case ENCODING_VVVV: + translateRegister(mcInst, insn.vvvv); + return false; + case ENCODING_DUP: + return translateOperand(mcInst, + insn.spec->operands[operand.type - TYPE_DUP0], + insn); + } +} + +/// translateInstruction - Translates an internal instruction and all its +/// operands to an MCInst. +/// +/// @param mcInst - The MCInst to populate with the instruction's data. +/// @param insn - The internal instruction. +/// @return - false on success; true otherwise. +static bool translateInstruction(MCInst &mcInst, + InternalInstruction &insn) { + if (!insn.spec) { + debug("Instruction has no specification"); + return true; + } + + mcInst.setOpcode(insn.instructionID); + + int index; + + insn.numImmediatesTranslated = 0; + + for (index = 0; index < X86_MAX_OPERANDS; ++index) { + if (insn.spec->operands[index].encoding != ENCODING_NONE) { + if (translateOperand(mcInst, insn.spec->operands[index], insn)) { + return true; + } + } + } + + return false; +} + +static MCDisassembler *createX86_32Disassembler(const Target &T, const MCSubtargetInfo &STI) { + return new X86Disassembler::X86_32Disassembler(STI); +} + +static MCDisassembler *createX86_64Disassembler(const Target &T, const MCSubtargetInfo &STI) { + return new X86Disassembler::X86_64Disassembler(STI); +} + +extern "C" void LLVMInitializeX86Disassembler() { + // Register the disassembler. + TargetRegistry::RegisterMCDisassembler(TheX86_32Target, + createX86_32Disassembler); + TargetRegistry::RegisterMCDisassembler(TheX86_64Target, + createX86_64Disassembler); +} diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.h b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.h new file mode 100644 index 0000000..6ac9a0f --- /dev/null +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.h @@ -0,0 +1,157 @@ +//===- X86Disassembler.h - Disassembler for x86 and x86_64 ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// The X86 disassembler is a table-driven disassembler for the 16-, 32-, and +// 64-bit X86 instruction sets. The main decode sequence for an assembly +// instruction in this disassembler is: +// +// 1. Read the prefix bytes and determine the attributes of the instruction. 
+// These attributes, recorded in enum attributeBits +// (X86DisassemblerDecoderCommon.h), form a bitmask. The table CONTEXTS_SYM +// provides a mapping from bitmasks to contexts, which are represented by +// enum InstructionContext (ibid.). +// +// 2. Read the opcode, and determine what kind of opcode it is. The +// disassembler distinguishes four kinds of opcodes, which are enumerated in +// OpcodeType (X86DisassemblerDecoderCommon.h): one-byte (0xnn), two-byte +// (0x0f 0xnn), three-byte-38 (0x0f 0x38 0xnn), or three-byte-3a +// (0x0f 0x3a 0xnn). Mandatory prefixes are treated as part of the context. +// +// 3. Depending on the opcode type, look in one of four ClassDecision structures +// (X86DisassemblerDecoderCommon.h). Use the opcode class to determine which +// OpcodeDecision (ibid.) to look the opcode in. Look up the opcode, to get +// a ModRMDecision (ibid.). +// +// 4. Some instructions, such as escape opcodes or extended opcodes, or even +// instructions that have ModRM*Reg / ModRM*Mem forms in LLVM, need the +// ModR/M byte to complete decode. The ModRMDecision's type is an entry from +// ModRMDecisionType (X86DisassemblerDecoderCommon.h) that indicates if the +// ModR/M byte is required and how to interpret it. +// +// 5. After resolving the ModRMDecision, the disassembler has a unique ID +// of type InstrUID (X86DisassemblerDecoderCommon.h). Looking this ID up in +// INSTRUCTIONS_SYM yields the name of the instruction and the encodings and +// meanings of its operands. +// +// 6. For each operand, its encoding is an entry from OperandEncoding +// (X86DisassemblerDecoderCommon.h) and its type is an entry from +// OperandType (ibid.). The encoding indicates how to read it from the +// instruction; the type indicates how to interpret the value once it has +// been read. For example, a register operand could be stored in the R/M +// field of the ModR/M byte, the REG field of the ModR/M byte, or added to +// the main opcode. This is orthogonal from its meaning (an GPR or an XMM +// register, for instance). Given this information, the operands can be +// extracted and interpreted. +// +// 7. As the last step, the disassembler translates the instruction information +// and operands into a format understandable by the client - in this case, an +// MCInst for use by the MC infrastructure. +// +// The disassembler is broken broadly into two parts: the table emitter that +// emits the instruction decode tables discussed above during compilation, and +// the disassembler itself. The table emitter is documented in more detail in +// utils/TableGen/X86DisassemblerEmitter.h. +// +// X86Disassembler.h contains the public interface for the disassembler, +// adhering to the MCDisassembler interface. +// X86Disassembler.cpp contains the code responsible for step 7, and for +// invoking the decoder to execute steps 1-6. +// X86DisassemblerDecoderCommon.h contains the definitions needed by both the +// table emitter and the disassembler. +// X86DisassemblerDecoder.h contains the public interface of the decoder, +// factored out into C for possible use by other projects. +// X86DisassemblerDecoder.c contains the source code of the decoder, which is +// responsible for steps 1-6. 
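The C decoder described above is driven through decodeInstruction() and the byteReader_t callback declared in X86DisassemblerDecoder.h. A minimal sketch of a caller, assuming only those declarations and that the header is on the include path; the BufferReader struct and the bufferReader/decodeOne helper names are illustrative, not part of the source:

#include "X86DisassemblerDecoder.h"

struct BufferReader {
  const uint8_t *bytes;   /* raw instruction bytes */
  uint64_t size;          /* number of valid bytes */
};

/* byteReader_t callback: copy one byte, or fail if the address is out of range. */
static int bufferReader(void *arg, uint8_t *byte, uint64_t address) {
  const struct BufferReader *r = (const struct BufferReader *)arg;
  if (address >= r->size)
    return -1;
  *byte = r->bytes[address];
  return 0;
}

/* Decode a single instruction from offset 0 of a buffer in 64-bit mode. */
static int decodeOne(const uint8_t *bytes, uint64_t size,
                     struct InternalInstruction *insn) {
  struct BufferReader reader = { bytes, size };
  return decodeInstruction(insn, bufferReader, &reader,
                           /* logger */ NULL, /* loggerArg */ NULL,
                           /* startLoc */ 0, MODE_64BIT);
}

On success the decoder fills in insn->instructionID, insn->spec and insn->length, and X86Disassembler.cpp performs step 7 by walking insn->spec->operands.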
+// +//===----------------------------------------------------------------------===// + +#ifndef X86DISASSEMBLER_H +#define X86DISASSEMBLER_H + +#define INSTRUCTION_SPECIFIER_FIELDS \ + const char* name; + +#define INSTRUCTION_IDS \ + const InstrUID *instructionIDs; + +#include "X86DisassemblerDecoderCommon.h" + +#undef INSTRUCTION_SPECIFIER_FIELDS +#undef INSTRUCTION_IDS + +#include "llvm/MC/MCDisassembler.h" + +struct InternalInstruction; + +namespace llvm { + +class MCInst; +class MCSubtargetInfo; +class MemoryObject; +class raw_ostream; + +struct EDInstInfo; + +namespace X86Disassembler { + +/// X86GenericDisassembler - Generic disassembler for all X86 platforms. +/// All each platform class should have to do is subclass the constructor, and +/// provide a different disassemblerMode value. +class X86GenericDisassembler : public MCDisassembler { +protected: + /// Constructor - Initializes the disassembler. + /// + /// @param mode - The X86 architecture mode to decode for. + X86GenericDisassembler(const MCSubtargetInfo &STI, DisassemblerMode mode); +public: + ~X86GenericDisassembler(); + + /// getInstruction - See MCDisassembler. + DecodeStatus getInstruction(MCInst &instr, + uint64_t &size, + const MemoryObject ®ion, + uint64_t address, + raw_ostream &vStream, + raw_ostream &cStream) const; + + /// getEDInfo - See MCDisassembler. + EDInstInfo *getEDInfo() const; +private: + DisassemblerMode fMode; +}; + +/// X86_16Disassembler - 16-bit X86 disassembler. +class X86_16Disassembler : public X86GenericDisassembler { +public: + X86_16Disassembler(const MCSubtargetInfo &STI) : + X86GenericDisassembler(STI, MODE_16BIT) { + } +}; + +/// X86_16Disassembler - 32-bit X86 disassembler. +class X86_32Disassembler : public X86GenericDisassembler { +public: + X86_32Disassembler(const MCSubtargetInfo &STI) : + X86GenericDisassembler(STI, MODE_32BIT) { + } +}; + +/// X86_16Disassembler - 64-bit X86 disassembler. +class X86_64Disassembler : public X86GenericDisassembler { +public: + X86_64Disassembler(const MCSubtargetInfo &STI) : + X86GenericDisassembler(STI, MODE_64BIT) { + } +}; + +} // namespace X86Disassembler + +} // namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c new file mode 100644 index 0000000..f9b0fe5 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c @@ -0,0 +1,1625 @@ +/*===- X86DisassemblerDecoder.c - Disassembler decoder -------------*- C -*-==* + * + * The LLVM Compiler Infrastructure + * + * This file is distributed under the University of Illinois Open Source + * License. See LICENSE.TXT for details. + * + *===----------------------------------------------------------------------===* + * + * This file is part of the X86 Disassembler. + * It contains the implementation of the instruction decoder. + * Documentation for the disassembler can be found in X86Disassembler.h. 
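The #define / #include / #undef sequence in X86Disassembler.h above (repeated in X86DisassemblerDecoder.h further down) lets each consumer splice its own fields into the struct definitions that X86DisassemblerDecoderCommon.h expands. A compressed, self-contained illustration of the same pattern; the names (SPECIFIER_EXTRA_FIELDS, struct SpecifierSketch) are hypothetical, and in the real headers the struct lives in the shared common header rather than inline:

#include <stdio.h>

#define SPECIFIER_EXTRA_FIELDS \
  const char *name;            /* the including file decides what goes here */

struct SpecifierSketch {       /* stands in for the shared definition */
  unsigned uid;
  SPECIFIER_EXTRA_FIELDS
};
#undef SPECIFIER_EXTRA_FIELDS

int main(void) {
  struct SpecifierSketch s = { 17u, "hypothetical-mnemonic" };
  printf("uid=%u name=%s\n", s.uid, s.name);
  return 0;
}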
+ * + *===----------------------------------------------------------------------===*/ + +#include <stdarg.h> /* for va_*() */ +#include <stdio.h> /* for vsnprintf() */ +#include <stdlib.h> /* for exit() */ +#include <string.h> /* for memset() */ + +#include "X86DisassemblerDecoder.h" + +#include "X86GenDisassemblerTables.inc" + +#define TRUE 1 +#define FALSE 0 + +typedef int8_t bool; + +#ifndef NDEBUG +#define debug(s) do { x86DisassemblerDebug(__FILE__, __LINE__, s); } while (0) +#else +#define debug(s) do { } while (0) +#endif + + +/* + * contextForAttrs - Client for the instruction context table. Takes a set of + * attributes and returns the appropriate decode context. + * + * @param attrMask - Attributes, from the enumeration attributeBits. + * @return - The InstructionContext to use when looking up an + * an instruction with these attributes. + */ +static InstructionContext contextForAttrs(uint8_t attrMask) { + return CONTEXTS_SYM[attrMask]; +} + +/* + * modRMRequired - Reads the appropriate instruction table to determine whether + * the ModR/M byte is required to decode a particular instruction. + * + * @param type - The opcode type (i.e., how many bytes it has). + * @param insnContext - The context for the instruction, as returned by + * contextForAttrs. + * @param opcode - The last byte of the instruction's opcode, not counting + * ModR/M extensions and escapes. + * @return - TRUE if the ModR/M byte is required, FALSE otherwise. + */ +static int modRMRequired(OpcodeType type, + InstructionContext insnContext, + uint8_t opcode) { + const struct ContextDecision* decision = 0; + + switch (type) { + case ONEBYTE: + decision = &ONEBYTE_SYM; + break; + case TWOBYTE: + decision = &TWOBYTE_SYM; + break; + case THREEBYTE_38: + decision = &THREEBYTE38_SYM; + break; + case THREEBYTE_3A: + decision = &THREEBYTE3A_SYM; + break; + case THREEBYTE_A6: + decision = &THREEBYTEA6_SYM; + break; + case THREEBYTE_A7: + decision = &THREEBYTEA7_SYM; + break; + } + + return decision->opcodeDecisions[insnContext].modRMDecisions[opcode]. + modrm_type != MODRM_ONEENTRY; + + return 0; +} + +/* + * decode - Reads the appropriate instruction table to obtain the unique ID of + * an instruction. + * + * @param type - See modRMRequired(). + * @param insnContext - See modRMRequired(). + * @param opcode - See modRMRequired(). + * @param modRM - The ModR/M byte if required, or any value if not. + * @return - The UID of the instruction, or 0 on failure. + */ +static InstrUID decode(OpcodeType type, + InstructionContext insnContext, + uint8_t opcode, + uint8_t modRM) { + const struct ModRMDecision* dec; + + switch (type) { + default: + debug("Unknown opcode type"); + return 0; + case ONEBYTE: + dec = &ONEBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; + case TWOBYTE: + dec = &TWOBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; + case THREEBYTE_38: + dec = &THREEBYTE38_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; + case THREEBYTE_3A: + dec = &THREEBYTE3A_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; + case THREEBYTE_A6: + dec = &THREEBYTEA6_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; + case THREEBYTE_A7: + dec = &THREEBYTEA7_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; + } + + switch (dec->modrm_type) { + default: + debug("Corrupt table! 
Unknown modrm_type"); + return 0; + case MODRM_ONEENTRY: + return dec->instructionIDs[0]; + case MODRM_SPLITRM: + if (modFromModRM(modRM) == 0x3) + return dec->instructionIDs[1]; + else + return dec->instructionIDs[0]; + case MODRM_FULL: + return dec->instructionIDs[modRM]; + } +} + +/* + * specifierForUID - Given a UID, returns the name and operand specification for + * that instruction. + * + * @param uid - The unique ID for the instruction. This should be returned by + * decode(); specifierForUID will not check bounds. + * @return - A pointer to the specification for that instruction. + */ +static const struct InstructionSpecifier *specifierForUID(InstrUID uid) { + return &INSTRUCTIONS_SYM[uid]; +} + +/* + * consumeByte - Uses the reader function provided by the user to consume one + * byte from the instruction's memory and advance the cursor. + * + * @param insn - The instruction with the reader function to use. The cursor + * for this instruction is advanced. + * @param byte - A pointer to a pre-allocated memory buffer to be populated + * with the data read. + * @return - 0 if the read was successful; nonzero otherwise. + */ +static int consumeByte(struct InternalInstruction* insn, uint8_t* byte) { + int ret = insn->reader(insn->readerArg, byte, insn->readerCursor); + + if (!ret) + ++(insn->readerCursor); + + return ret; +} + +/* + * lookAtByte - Like consumeByte, but does not advance the cursor. + * + * @param insn - See consumeByte(). + * @param byte - See consumeByte(). + * @return - See consumeByte(). + */ +static int lookAtByte(struct InternalInstruction* insn, uint8_t* byte) { + return insn->reader(insn->readerArg, byte, insn->readerCursor); +} + +static void unconsumeByte(struct InternalInstruction* insn) { + insn->readerCursor--; +} + +#define CONSUME_FUNC(name, type) \ + static int name(struct InternalInstruction* insn, type* ptr) { \ + type combined = 0; \ + unsigned offset; \ + for (offset = 0; offset < sizeof(type); ++offset) { \ + uint8_t byte; \ + int ret = insn->reader(insn->readerArg, \ + &byte, \ + insn->readerCursor + offset); \ + if (ret) \ + return ret; \ + combined = combined | ((type)byte << ((type)offset * 8)); \ + } \ + *ptr = combined; \ + insn->readerCursor += sizeof(type); \ + return 0; \ + } + +/* + * consume* - Use the reader function provided by the user to consume data + * values of various sizes from the instruction's memory and advance the + * cursor appropriately. These readers perform endian conversion. + * + * @param insn - See consumeByte(). + * @param ptr - A pointer to a pre-allocated memory of appropriate size to + * be populated with the data read. + * @return - See consumeByte(). + */ +CONSUME_FUNC(consumeInt8, int8_t) +CONSUME_FUNC(consumeInt16, int16_t) +CONSUME_FUNC(consumeInt32, int32_t) +CONSUME_FUNC(consumeUInt16, uint16_t) +CONSUME_FUNC(consumeUInt32, uint32_t) +CONSUME_FUNC(consumeUInt64, uint64_t) + +/* + * dbgprintf - Uses the logging function provided by the user to log a single + * message, typically without a carriage-return. + * + * @param insn - The instruction containing the logging function. + * @param format - See printf(). + * @param ... - See printf(). + */ +static void dbgprintf(struct InternalInstruction* insn, + const char* format, + ...) 
{ + char buffer[256]; + va_list ap; + + if (!insn->dlog) + return; + + va_start(ap, format); + (void)vsnprintf(buffer, sizeof(buffer), format, ap); + va_end(ap); + + insn->dlog(insn->dlogArg, buffer); + + return; +} + +/* + * setPrefixPresent - Marks that a particular prefix is present at a particular + * location. + * + * @param insn - The instruction to be marked as having the prefix. + * @param prefix - The prefix that is present. + * @param location - The location where the prefix is located (in the address + * space of the instruction's reader). + */ +static void setPrefixPresent(struct InternalInstruction* insn, + uint8_t prefix, + uint64_t location) +{ + insn->prefixPresent[prefix] = 1; + insn->prefixLocations[prefix] = location; +} + +/* + * isPrefixAtLocation - Queries an instruction to determine whether a prefix is + * present at a given location. + * + * @param insn - The instruction to be queried. + * @param prefix - The prefix. + * @param location - The location to query. + * @return - Whether the prefix is at that location. + */ +static BOOL isPrefixAtLocation(struct InternalInstruction* insn, + uint8_t prefix, + uint64_t location) +{ + if (insn->prefixPresent[prefix] == 1 && + insn->prefixLocations[prefix] == location) + return TRUE; + else + return FALSE; +} + +/* + * readPrefixes - Consumes all of an instruction's prefix bytes, and marks the + * instruction as having them. Also sets the instruction's default operand, + * address, and other relevant data sizes to report operands correctly. + * + * @param insn - The instruction whose prefixes are to be read. + * @return - 0 if the instruction could be read until the end of the prefix + * bytes, and no prefixes conflicted; nonzero otherwise. + */ +static int readPrefixes(struct InternalInstruction* insn) { + BOOL isPrefix = TRUE; + BOOL prefixGroups[4] = { FALSE }; + uint64_t prefixLocation; + uint8_t byte = 0; + + BOOL hasAdSize = FALSE; + BOOL hasOpSize = FALSE; + + dbgprintf(insn, "readPrefixes()"); + + while (isPrefix) { + prefixLocation = insn->readerCursor; + + if (consumeByte(insn, &byte)) + return -1; + + switch (byte) { + case 0xf0: /* LOCK */ + case 0xf2: /* REPNE/REPNZ */ + case 0xf3: /* REP or REPE/REPZ */ + if (prefixGroups[0]) + dbgprintf(insn, "Redundant Group 1 prefix"); + prefixGroups[0] = TRUE; + setPrefixPresent(insn, byte, prefixLocation); + break; + case 0x2e: /* CS segment override -OR- Branch not taken */ + case 0x36: /* SS segment override -OR- Branch taken */ + case 0x3e: /* DS segment override */ + case 0x26: /* ES segment override */ + case 0x64: /* FS segment override */ + case 0x65: /* GS segment override */ + switch (byte) { + case 0x2e: + insn->segmentOverride = SEG_OVERRIDE_CS; + break; + case 0x36: + insn->segmentOverride = SEG_OVERRIDE_SS; + break; + case 0x3e: + insn->segmentOverride = SEG_OVERRIDE_DS; + break; + case 0x26: + insn->segmentOverride = SEG_OVERRIDE_ES; + break; + case 0x64: + insn->segmentOverride = SEG_OVERRIDE_FS; + break; + case 0x65: + insn->segmentOverride = SEG_OVERRIDE_GS; + break; + default: + debug("Unhandled override"); + return -1; + } + if (prefixGroups[1]) + dbgprintf(insn, "Redundant Group 2 prefix"); + prefixGroups[1] = TRUE; + setPrefixPresent(insn, byte, prefixLocation); + break; + case 0x66: /* Operand-size override */ + if (prefixGroups[2]) + dbgprintf(insn, "Redundant Group 3 prefix"); + prefixGroups[2] = TRUE; + hasOpSize = TRUE; + setPrefixPresent(insn, byte, prefixLocation); + break; + case 0x67: /* Address-size override */ + if (prefixGroups[3]) + 
dbgprintf(insn, "Redundant Group 4 prefix"); + prefixGroups[3] = TRUE; + hasAdSize = TRUE; + setPrefixPresent(insn, byte, prefixLocation); + break; + default: /* Not a prefix byte */ + isPrefix = FALSE; + break; + } + + if (isPrefix) + dbgprintf(insn, "Found prefix 0x%hhx", byte); + } + + insn->vexSize = 0; + + if (byte == 0xc4) { + uint8_t byte1; + + if (lookAtByte(insn, &byte1)) { + dbgprintf(insn, "Couldn't read second byte of VEX"); + return -1; + } + + if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) { + insn->vexSize = 3; + insn->necessaryPrefixLocation = insn->readerCursor - 1; + } + else { + unconsumeByte(insn); + insn->necessaryPrefixLocation = insn->readerCursor - 1; + } + + if (insn->vexSize == 3) { + insn->vexPrefix[0] = byte; + consumeByte(insn, &insn->vexPrefix[1]); + consumeByte(insn, &insn->vexPrefix[2]); + + /* We simulate the REX prefix for simplicity's sake */ + + if (insn->mode == MODE_64BIT) { + insn->rexPrefix = 0x40 + | (wFromVEX3of3(insn->vexPrefix[2]) << 3) + | (rFromVEX2of3(insn->vexPrefix[1]) << 2) + | (xFromVEX2of3(insn->vexPrefix[1]) << 1) + | (bFromVEX2of3(insn->vexPrefix[1]) << 0); + } + + switch (ppFromVEX3of3(insn->vexPrefix[2])) + { + default: + break; + case VEX_PREFIX_66: + hasOpSize = TRUE; + break; + } + + dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx 0x%hhx", insn->vexPrefix[0], insn->vexPrefix[1], insn->vexPrefix[2]); + } + } + else if (byte == 0xc5) { + uint8_t byte1; + + if (lookAtByte(insn, &byte1)) { + dbgprintf(insn, "Couldn't read second byte of VEX"); + return -1; + } + + if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) { + insn->vexSize = 2; + } + else { + unconsumeByte(insn); + } + + if (insn->vexSize == 2) { + insn->vexPrefix[0] = byte; + consumeByte(insn, &insn->vexPrefix[1]); + + if (insn->mode == MODE_64BIT) { + insn->rexPrefix = 0x40 + | (rFromVEX2of2(insn->vexPrefix[1]) << 2); + } + + switch (ppFromVEX2of2(insn->vexPrefix[1])) + { + default: + break; + case VEX_PREFIX_66: + hasOpSize = TRUE; + break; + } + + dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx", insn->vexPrefix[0], insn->vexPrefix[1]); + } + } + else { + if (insn->mode == MODE_64BIT) { + if ((byte & 0xf0) == 0x40) { + uint8_t opcodeByte; + + if (lookAtByte(insn, &opcodeByte) || ((opcodeByte & 0xf0) == 0x40)) { + dbgprintf(insn, "Redundant REX prefix"); + return -1; + } + + insn->rexPrefix = byte; + insn->necessaryPrefixLocation = insn->readerCursor - 2; + + dbgprintf(insn, "Found REX prefix 0x%hhx", byte); + } else { + unconsumeByte(insn); + insn->necessaryPrefixLocation = insn->readerCursor - 1; + } + } else { + unconsumeByte(insn); + insn->necessaryPrefixLocation = insn->readerCursor - 1; + } + } + + if (insn->mode == MODE_16BIT) { + insn->registerSize = (hasOpSize ? 4 : 2); + insn->addressSize = (hasAdSize ? 4 : 2); + insn->displacementSize = (hasAdSize ? 4 : 2); + insn->immediateSize = (hasOpSize ? 4 : 2); + } else if (insn->mode == MODE_32BIT) { + insn->registerSize = (hasOpSize ? 2 : 4); + insn->addressSize = (hasAdSize ? 2 : 4); + insn->displacementSize = (hasAdSize ? 2 : 4); + insn->immediateSize = (hasOpSize ? 2 : 4); + } else if (insn->mode == MODE_64BIT) { + if (insn->rexPrefix && wFromREX(insn->rexPrefix)) { + insn->registerSize = 8; + insn->addressSize = (hasAdSize ? 4 : 8); + insn->displacementSize = 4; + insn->immediateSize = 4; + } else if (insn->rexPrefix) { + insn->registerSize = (hasOpSize ? 2 : 4); + insn->addressSize = (hasAdSize ? 4 : 8); + insn->displacementSize = (hasOpSize ? 2 : 4); + insn->immediateSize = (hasOpSize ? 
2 : 4); + } else { + insn->registerSize = (hasOpSize ? 2 : 4); + insn->addressSize = (hasAdSize ? 4 : 8); + insn->displacementSize = (hasOpSize ? 2 : 4); + insn->immediateSize = (hasOpSize ? 2 : 4); + } + } + + return 0; +} + +/* + * readOpcode - Reads the opcode (excepting the ModR/M byte in the case of + * extended or escape opcodes). + * + * @param insn - The instruction whose opcode is to be read. + * @return - 0 if the opcode could be read successfully; nonzero otherwise. + */ +static int readOpcode(struct InternalInstruction* insn) { + /* Determine the length of the primary opcode */ + + uint8_t current; + + dbgprintf(insn, "readOpcode()"); + + insn->opcodeType = ONEBYTE; + + if (insn->vexSize == 3) + { + switch (mmmmmFromVEX2of3(insn->vexPrefix[1])) + { + default: + dbgprintf(insn, "Unhandled m-mmmm field for instruction (0x%hhx)", mmmmmFromVEX2of3(insn->vexPrefix[1])); + return -1; + case 0: + break; + case VEX_LOB_0F: + insn->twoByteEscape = 0x0f; + insn->opcodeType = TWOBYTE; + return consumeByte(insn, &insn->opcode); + case VEX_LOB_0F38: + insn->twoByteEscape = 0x0f; + insn->threeByteEscape = 0x38; + insn->opcodeType = THREEBYTE_38; + return consumeByte(insn, &insn->opcode); + case VEX_LOB_0F3A: + insn->twoByteEscape = 0x0f; + insn->threeByteEscape = 0x3a; + insn->opcodeType = THREEBYTE_3A; + return consumeByte(insn, &insn->opcode); + } + } + else if (insn->vexSize == 2) + { + insn->twoByteEscape = 0x0f; + insn->opcodeType = TWOBYTE; + return consumeByte(insn, &insn->opcode); + } + + if (consumeByte(insn, ¤t)) + return -1; + + if (current == 0x0f) { + dbgprintf(insn, "Found a two-byte escape prefix (0x%hhx)", current); + + insn->twoByteEscape = current; + + if (consumeByte(insn, ¤t)) + return -1; + + if (current == 0x38) { + dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current); + + insn->threeByteEscape = current; + + if (consumeByte(insn, ¤t)) + return -1; + + insn->opcodeType = THREEBYTE_38; + } else if (current == 0x3a) { + dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current); + + insn->threeByteEscape = current; + + if (consumeByte(insn, ¤t)) + return -1; + + insn->opcodeType = THREEBYTE_3A; + } else if (current == 0xa6) { + dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current); + + insn->threeByteEscape = current; + + if (consumeByte(insn, ¤t)) + return -1; + + insn->opcodeType = THREEBYTE_A6; + } else if (current == 0xa7) { + dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current); + + insn->threeByteEscape = current; + + if (consumeByte(insn, ¤t)) + return -1; + + insn->opcodeType = THREEBYTE_A7; + } else { + dbgprintf(insn, "Didn't find a three-byte escape prefix"); + + insn->opcodeType = TWOBYTE; + } + } + + /* + * At this point we have consumed the full opcode. + * Anything we consume from here on must be unconsumed. + */ + + insn->opcode = current; + + return 0; +} + +static int readModRM(struct InternalInstruction* insn); + +/* + * getIDWithAttrMask - Determines the ID of an instruction, consuming + * the ModR/M byte as appropriate for extended and escape opcodes, + * and using a supplied attribute mask. + * + * @param instructionID - A pointer whose target is filled in with the ID of the + * instruction. + * @param insn - The instruction whose ID is to be determined. + * @param attrMask - The attribute mask to search. + * @return - 0 if the ModR/M could be read when needed or was not + * needed; nonzero otherwise. 
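The prefix reader above ends by fixing the operand and address widths used for the rest of the decode. The same mode-dependent table, restated as a standalone helper for clarity; DisassemblerMode and the MODE_* values are the real ones from the decoder headers, while the helper itself is only a sketch:

#include "X86DisassemblerDecoder.h"

/* Mirrors the width selection at the end of readPrefixes() above. */
static void effectiveSizes(DisassemblerMode mode, int hasOpSize, int hasAdSize,
                           int rexW, uint8_t *registerSize, uint8_t *addressSize) {
  switch (mode) {
  case MODE_16BIT:
    *registerSize = hasOpSize ? 4 : 2;               /* 0x66 widens */
    *addressSize  = hasAdSize ? 4 : 2;               /* 0x67 widens */
    break;
  case MODE_32BIT:
    *registerSize = hasOpSize ? 2 : 4;               /* 0x66 narrows */
    *addressSize  = hasAdSize ? 2 : 4;               /* 0x67 narrows */
    break;
  case MODE_64BIT:
    *registerSize = rexW ? 8 : (hasOpSize ? 2 : 4);  /* REX.W overrides 0x66 */
    *addressSize  = hasAdSize ? 4 : 8;
    break;
  }
}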
+ */ +static int getIDWithAttrMask(uint16_t* instructionID, + struct InternalInstruction* insn, + uint8_t attrMask) { + BOOL hasModRMExtension; + + uint8_t instructionClass; + + instructionClass = contextForAttrs(attrMask); + + hasModRMExtension = modRMRequired(insn->opcodeType, + instructionClass, + insn->opcode); + + if (hasModRMExtension) { + if (readModRM(insn)) + return -1; + + *instructionID = decode(insn->opcodeType, + instructionClass, + insn->opcode, + insn->modRM); + } else { + *instructionID = decode(insn->opcodeType, + instructionClass, + insn->opcode, + 0); + } + + return 0; +} + +/* + * is16BitEquivalent - Determines whether two instruction names refer to + * equivalent instructions but one is 16-bit whereas the other is not. + * + * @param orig - The instruction that is not 16-bit + * @param equiv - The instruction that is 16-bit + */ +static BOOL is16BitEquvalent(const char* orig, const char* equiv) { + off_t i; + + for (i = 0;; i++) { + if (orig[i] == '\0' && equiv[i] == '\0') + return TRUE; + if (orig[i] == '\0' || equiv[i] == '\0') + return FALSE; + if (orig[i] != equiv[i]) { + if ((orig[i] == 'Q' || orig[i] == 'L') && equiv[i] == 'W') + continue; + if ((orig[i] == '6' || orig[i] == '3') && equiv[i] == '1') + continue; + if ((orig[i] == '4' || orig[i] == '2') && equiv[i] == '6') + continue; + return FALSE; + } + } +} + +/* + * getID - Determines the ID of an instruction, consuming the ModR/M byte as + * appropriate for extended and escape opcodes. Determines the attributes and + * context for the instruction before doing so. + * + * @param insn - The instruction whose ID is to be determined. + * @return - 0 if the ModR/M could be read when needed or was not needed; + * nonzero otherwise. + */ +static int getID(struct InternalInstruction* insn) { + uint8_t attrMask; + uint16_t instructionID; + + dbgprintf(insn, "getID()"); + + attrMask = ATTR_NONE; + + if (insn->mode == MODE_64BIT) + attrMask |= ATTR_64BIT; + + if (insn->vexSize) { + attrMask |= ATTR_VEX; + + if (insn->vexSize == 3) { + switch (ppFromVEX3of3(insn->vexPrefix[2])) { + case VEX_PREFIX_66: + attrMask |= ATTR_OPSIZE; + break; + case VEX_PREFIX_F3: + attrMask |= ATTR_XS; + break; + case VEX_PREFIX_F2: + attrMask |= ATTR_XD; + break; + } + + if (lFromVEX3of3(insn->vexPrefix[2])) + attrMask |= ATTR_VEXL; + } + else if (insn->vexSize == 2) { + switch (ppFromVEX2of2(insn->vexPrefix[1])) { + case VEX_PREFIX_66: + attrMask |= ATTR_OPSIZE; + break; + case VEX_PREFIX_F3: + attrMask |= ATTR_XS; + break; + case VEX_PREFIX_F2: + attrMask |= ATTR_XD; + break; + } + + if (lFromVEX2of2(insn->vexPrefix[1])) + attrMask |= ATTR_VEXL; + } + else { + return -1; + } + } + else { + if (isPrefixAtLocation(insn, 0x66, insn->necessaryPrefixLocation)) + attrMask |= ATTR_OPSIZE; + else if (isPrefixAtLocation(insn, 0xf3, insn->necessaryPrefixLocation)) + attrMask |= ATTR_XS; + else if (isPrefixAtLocation(insn, 0xf2, insn->necessaryPrefixLocation)) + attrMask |= ATTR_XD; + } + + if (insn->rexPrefix & 0x08) + attrMask |= ATTR_REXW; + + if (getIDWithAttrMask(&instructionID, insn, attrMask)) + return -1; + + /* The following clauses compensate for limitations of the tables. */ + + if ((attrMask & ATTR_VEXL) && (attrMask & ATTR_REXW)) { + /* + * Some VEX instructions ignore the L-bit, but use the W-bit. Normally L-bit + * has precedence since there are no L-bit with W-bit entries in the tables. + * So if the L-bit isn't significant we should use the W-bit instead. 
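getID() above folds the prefix state into an attribute mask before consulting CONTEXTS_SYM. A simplified restatement of the VEX branch of that computation; the ATTR_* and VEX_PREFIX_* constants are the real ones from the decoder headers, the helper is a sketch only:

#include "X86DisassemblerDecoder.h"

/* Attribute mask for a VEX-prefixed instruction, mirroring getID(). */
static uint8_t vexAttrMask(DisassemblerMode mode, uint8_t pp, int vexL, int rexW) {
  uint8_t attrMask = ATTR_VEX;
  if (mode == MODE_64BIT)
    attrMask |= ATTR_64BIT;
  switch (pp) {                      /* VEX.pp encodes the implied legacy prefix */
  case VEX_PREFIX_66: attrMask |= ATTR_OPSIZE; break;
  case VEX_PREFIX_F3: attrMask |= ATTR_XS;     break;
  case VEX_PREFIX_F2: attrMask |= ATTR_XD;     break;
  default:            break;
  }
  if (vexL)
    attrMask |= ATTR_VEXL;
  if (rexW)                          /* W comes from the simulated REX prefix */
    attrMask |= ATTR_REXW;
  return attrMask;
}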
+ */ + + const struct InstructionSpecifier *spec; + uint16_t instructionIDWithWBit; + const struct InstructionSpecifier *specWithWBit; + + spec = specifierForUID(instructionID); + + if (getIDWithAttrMask(&instructionIDWithWBit, + insn, + (attrMask & (~ATTR_VEXL)) | ATTR_REXW)) { + insn->instructionID = instructionID; + insn->spec = spec; + return 0; + } + + specWithWBit = specifierForUID(instructionIDWithWBit); + + if (instructionID != instructionIDWithWBit) { + insn->instructionID = instructionIDWithWBit; + insn->spec = specWithWBit; + } else { + insn->instructionID = instructionID; + insn->spec = spec; + } + return 0; + } + + if (insn->prefixPresent[0x66] && !(attrMask & ATTR_OPSIZE)) { + /* + * The instruction tables make no distinction between instructions that + * allow OpSize anywhere (i.e., 16-bit operations) and that need it in a + * particular spot (i.e., many MMX operations). In general we're + * conservative, but in the specific case where OpSize is present but not + * in the right place we check if there's a 16-bit operation. + */ + + const struct InstructionSpecifier *spec; + uint16_t instructionIDWithOpsize; + const struct InstructionSpecifier *specWithOpsize; + + spec = specifierForUID(instructionID); + + if (getIDWithAttrMask(&instructionIDWithOpsize, + insn, + attrMask | ATTR_OPSIZE)) { + /* + * ModRM required with OpSize but not present; give up and return version + * without OpSize set + */ + + insn->instructionID = instructionID; + insn->spec = spec; + return 0; + } + + specWithOpsize = specifierForUID(instructionIDWithOpsize); + + if (is16BitEquvalent(spec->name, specWithOpsize->name)) { + insn->instructionID = instructionIDWithOpsize; + insn->spec = specWithOpsize; + } else { + insn->instructionID = instructionID; + insn->spec = spec; + } + return 0; + } + + if (insn->opcodeType == ONEBYTE && insn->opcode == 0x90 && + insn->rexPrefix & 0x01) { + /* + * NOOP shouldn't decode as NOOP if REX.b is set. Instead + * it should decode as XCHG %r8, %eax. + */ + + const struct InstructionSpecifier *spec; + uint16_t instructionIDWithNewOpcode; + const struct InstructionSpecifier *specWithNewOpcode; + + spec = specifierForUID(instructionID); + + /* Borrow opcode from one of the other XCHGar opcodes */ + insn->opcode = 0x91; + + if (getIDWithAttrMask(&instructionIDWithNewOpcode, + insn, + attrMask)) { + insn->opcode = 0x90; + + insn->instructionID = instructionID; + insn->spec = spec; + return 0; + } + + specWithNewOpcode = specifierForUID(instructionIDWithNewOpcode); + + /* Change back */ + insn->opcode = 0x90; + + insn->instructionID = instructionIDWithNewOpcode; + insn->spec = specWithNewOpcode; + + return 0; + } + + insn->instructionID = instructionID; + insn->spec = specifierForUID(insn->instructionID); + + return 0; +} + +/* + * readSIB - Consumes the SIB byte to determine addressing information for an + * instruction. + * + * @param insn - The instruction whose SIB byte is to be read. + * @return - 0 if the SIB byte was successfully read; nonzero otherwise. 
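The OpSize compensation above hinges on is16BitEquvalent() deciding that two specifier names differ only by operand width. A compressed restatement of its per-character tolerance, purely illustrative; for example "MOV32rr" vs "MOV16rr" passes, while "MOV32rr" vs "ADD16rr" does not:

/* One character of the name comparison done by is16BitEquvalent() above. */
static int widthCharsMatch(char orig, char equiv) {
  if (orig == equiv) return 1;
  if ((orig == 'Q' || orig == 'L') && equiv == 'W') return 1;  /* Q/L vs W suffix */
  if ((orig == '6' || orig == '3') && equiv == '1') return 1;  /* 64/32 vs 16, 1st digit */
  if ((orig == '4' || orig == '2') && equiv == '6') return 1;  /* 64/32 vs 16, 2nd digit */
  return 0;
}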
+ */ +static int readSIB(struct InternalInstruction* insn) { + SIBIndex sibIndexBase = 0; + SIBBase sibBaseBase = 0; + uint8_t index, base; + + dbgprintf(insn, "readSIB()"); + + if (insn->consumedSIB) + return 0; + + insn->consumedSIB = TRUE; + + switch (insn->addressSize) { + case 2: + dbgprintf(insn, "SIB-based addressing doesn't work in 16-bit mode"); + return -1; + break; + case 4: + sibIndexBase = SIB_INDEX_EAX; + sibBaseBase = SIB_BASE_EAX; + break; + case 8: + sibIndexBase = SIB_INDEX_RAX; + sibBaseBase = SIB_BASE_RAX; + break; + } + + if (consumeByte(insn, &insn->sib)) + return -1; + + index = indexFromSIB(insn->sib) | (xFromREX(insn->rexPrefix) << 3); + + switch (index) { + case 0x4: + insn->sibIndex = SIB_INDEX_NONE; + break; + default: + insn->sibIndex = (SIBIndex)(sibIndexBase + index); + if (insn->sibIndex == SIB_INDEX_sib || + insn->sibIndex == SIB_INDEX_sib64) + insn->sibIndex = SIB_INDEX_NONE; + break; + } + + switch (scaleFromSIB(insn->sib)) { + case 0: + insn->sibScale = 1; + break; + case 1: + insn->sibScale = 2; + break; + case 2: + insn->sibScale = 4; + break; + case 3: + insn->sibScale = 8; + break; + } + + base = baseFromSIB(insn->sib) | (bFromREX(insn->rexPrefix) << 3); + + switch (base) { + case 0x5: + switch (modFromModRM(insn->modRM)) { + case 0x0: + insn->eaDisplacement = EA_DISP_32; + insn->sibBase = SIB_BASE_NONE; + break; + case 0x1: + insn->eaDisplacement = EA_DISP_8; + insn->sibBase = (insn->addressSize == 4 ? + SIB_BASE_EBP : SIB_BASE_RBP); + break; + case 0x2: + insn->eaDisplacement = EA_DISP_32; + insn->sibBase = (insn->addressSize == 4 ? + SIB_BASE_EBP : SIB_BASE_RBP); + break; + case 0x3: + debug("Cannot have Mod = 0b11 and a SIB byte"); + return -1; + } + break; + default: + insn->sibBase = (SIBBase)(sibBaseBase + base); + break; + } + + return 0; +} + +/* + * readDisplacement - Consumes the displacement of an instruction. + * + * @param insn - The instruction whose displacement is to be read. + * @return - 0 if the displacement byte was successfully read; nonzero + * otherwise. + */ +static int readDisplacement(struct InternalInstruction* insn) { + int8_t d8; + int16_t d16; + int32_t d32; + + dbgprintf(insn, "readDisplacement()"); + + if (insn->consumedDisplacement) + return 0; + + insn->consumedDisplacement = TRUE; + + switch (insn->eaDisplacement) { + case EA_DISP_NONE: + insn->consumedDisplacement = FALSE; + break; + case EA_DISP_8: + if (consumeInt8(insn, &d8)) + return -1; + insn->displacement = d8; + break; + case EA_DISP_16: + if (consumeInt16(insn, &d16)) + return -1; + insn->displacement = d16; + break; + case EA_DISP_32: + if (consumeInt32(insn, &d32)) + return -1; + insn->displacement = d32; + break; + } + + insn->consumedDisplacement = TRUE; + return 0; +} + +/* + * readModRM - Consumes all addressing information (ModR/M byte, SIB byte, and + * displacement) for an instruction and interprets it. + * + * @param insn - The instruction whose addressing information is to be read. + * @return - 0 if the information was successfully read; nonzero otherwise. 
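readSIB() above reduces the SIB byte to a scale, an index and a base, with REX.X and REX.B extending the index and base fields. The split itself, using the accessor macros from X86DisassemblerDecoder.h; as a worked example, sib = 0x98 with no REX prefix gives base RAX, index RBX, scale 4, i.e. [rax + rbx*4]. The helper is a sketch, not part of the source:

#include "X86DisassemblerDecoder.h"

/* Field split performed by readSIB(); index 4 means "no index" and a base of
   5 is subject to the mod-dependent special case handled above. */
static void splitSIB(uint8_t sib, uint8_t rex,
                     uint8_t *scale, uint8_t *index, uint8_t *base) {
  *scale = (uint8_t)(1u << scaleFromSIB(sib));                  /* 0..3 -> 1,2,4,8 */
  *index = (uint8_t)(indexFromSIB(sib) | (xFromREX(rex) << 3));
  *base  = (uint8_t)(baseFromSIB(sib)  | (bFromREX(rex) << 3));
}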
+ */ +static int readModRM(struct InternalInstruction* insn) { + uint8_t mod, rm, reg; + + dbgprintf(insn, "readModRM()"); + + if (insn->consumedModRM) + return 0; + + if (consumeByte(insn, &insn->modRM)) + return -1; + insn->consumedModRM = TRUE; + + mod = modFromModRM(insn->modRM); + rm = rmFromModRM(insn->modRM); + reg = regFromModRM(insn->modRM); + + /* + * This goes by insn->registerSize to pick the correct register, which messes + * up if we're using (say) XMM or 8-bit register operands. That gets fixed in + * fixupReg(). + */ + switch (insn->registerSize) { + case 2: + insn->regBase = MODRM_REG_AX; + insn->eaRegBase = EA_REG_AX; + break; + case 4: + insn->regBase = MODRM_REG_EAX; + insn->eaRegBase = EA_REG_EAX; + break; + case 8: + insn->regBase = MODRM_REG_RAX; + insn->eaRegBase = EA_REG_RAX; + break; + } + + reg |= rFromREX(insn->rexPrefix) << 3; + rm |= bFromREX(insn->rexPrefix) << 3; + + insn->reg = (Reg)(insn->regBase + reg); + + switch (insn->addressSize) { + case 2: + insn->eaBaseBase = EA_BASE_BX_SI; + + switch (mod) { + case 0x0: + if (rm == 0x6) { + insn->eaBase = EA_BASE_NONE; + insn->eaDisplacement = EA_DISP_16; + if (readDisplacement(insn)) + return -1; + } else { + insn->eaBase = (EABase)(insn->eaBaseBase + rm); + insn->eaDisplacement = EA_DISP_NONE; + } + break; + case 0x1: + insn->eaBase = (EABase)(insn->eaBaseBase + rm); + insn->eaDisplacement = EA_DISP_8; + if (readDisplacement(insn)) + return -1; + break; + case 0x2: + insn->eaBase = (EABase)(insn->eaBaseBase + rm); + insn->eaDisplacement = EA_DISP_16; + if (readDisplacement(insn)) + return -1; + break; + case 0x3: + insn->eaBase = (EABase)(insn->eaRegBase + rm); + if (readDisplacement(insn)) + return -1; + break; + } + break; + case 4: + case 8: + insn->eaBaseBase = (insn->addressSize == 4 ? EA_BASE_EAX : EA_BASE_RAX); + + switch (mod) { + case 0x0: + insn->eaDisplacement = EA_DISP_NONE; /* readSIB may override this */ + switch (rm) { + case 0x4: + case 0xc: /* in case REXW.b is set */ + insn->eaBase = (insn->addressSize == 4 ? + EA_BASE_sib : EA_BASE_sib64); + readSIB(insn); + if (readDisplacement(insn)) + return -1; + break; + case 0x5: + insn->eaBase = EA_BASE_NONE; + insn->eaDisplacement = EA_DISP_32; + if (readDisplacement(insn)) + return -1; + break; + default: + insn->eaBase = (EABase)(insn->eaBaseBase + rm); + break; + } + break; + case 0x1: + case 0x2: + insn->eaDisplacement = (mod == 0x1 ? 
EA_DISP_8 : EA_DISP_32); + switch (rm) { + case 0x4: + case 0xc: /* in case REXW.b is set */ + insn->eaBase = EA_BASE_sib; + readSIB(insn); + if (readDisplacement(insn)) + return -1; + break; + default: + insn->eaBase = (EABase)(insn->eaBaseBase + rm); + if (readDisplacement(insn)) + return -1; + break; + } + break; + case 0x3: + insn->eaDisplacement = EA_DISP_NONE; + insn->eaBase = (EABase)(insn->eaRegBase + rm); + break; + } + break; + } /* switch (insn->addressSize) */ + + return 0; +} + +#define GENERIC_FIXUP_FUNC(name, base, prefix) \ + static uint8_t name(struct InternalInstruction *insn, \ + OperandType type, \ + uint8_t index, \ + uint8_t *valid) { \ + *valid = 1; \ + switch (type) { \ + default: \ + debug("Unhandled register type"); \ + *valid = 0; \ + return 0; \ + case TYPE_Rv: \ + return base + index; \ + case TYPE_R8: \ + if (insn->rexPrefix && \ + index >= 4 && index <= 7) { \ + return prefix##_SPL + (index - 4); \ + } else { \ + return prefix##_AL + index; \ + } \ + case TYPE_R16: \ + return prefix##_AX + index; \ + case TYPE_R32: \ + return prefix##_EAX + index; \ + case TYPE_R64: \ + return prefix##_RAX + index; \ + case TYPE_XMM256: \ + return prefix##_YMM0 + index; \ + case TYPE_XMM128: \ + case TYPE_XMM64: \ + case TYPE_XMM32: \ + case TYPE_XMM: \ + return prefix##_XMM0 + index; \ + case TYPE_MM64: \ + case TYPE_MM32: \ + case TYPE_MM: \ + if (index > 7) \ + *valid = 0; \ + return prefix##_MM0 + index; \ + case TYPE_SEGMENTREG: \ + if (index > 5) \ + *valid = 0; \ + return prefix##_ES + index; \ + case TYPE_DEBUGREG: \ + if (index > 7) \ + *valid = 0; \ + return prefix##_DR0 + index; \ + case TYPE_CONTROLREG: \ + if (index > 8) \ + *valid = 0; \ + return prefix##_CR0 + index; \ + } \ + } + +/* + * fixup*Value - Consults an operand type to determine the meaning of the + * reg or R/M field. If the operand is an XMM operand, for example, an + * operand would be XMM0 instead of AX, which readModRM() would otherwise + * misinterpret it as. + * + * @param insn - The instruction containing the operand. + * @param type - The operand type. + * @param index - The existing value of the field as reported by readModRM(). + * @param valid - The address of a uint8_t. The target is set to 1 if the + * field is valid for the register class; 0 if not. + * @return - The proper value. + */ +GENERIC_FIXUP_FUNC(fixupRegValue, insn->regBase, MODRM_REG) +GENERIC_FIXUP_FUNC(fixupRMValue, insn->eaRegBase, EA_REG) + +/* + * fixupReg - Consults an operand specifier to determine which of the + * fixup*Value functions to use in correcting readModRM()'ss interpretation. + * + * @param insn - See fixup*Value(). + * @param op - The operand specifier. + * @return - 0 if fixup was successful; -1 if the register returned was + * invalid for its class. 
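readModRM() above performs the analogous split for the ModR/M byte, with REX.R extending the reg field and REX.B extending the R/M field. As a worked example, modRM = 0xd8 with REX.R set gives mod = 3, reg = 11, rm = 0; at 32-bit operand width those indices name R11D and EAX (before fixupReg() adjusts for the operand type). The helper is a sketch only:

#include "X86DisassemblerDecoder.h"

/* Field split performed at the top of readModRM(). */
static void splitModRM(uint8_t modRM, uint8_t rex,
                       uint8_t *mod, uint8_t *reg, uint8_t *rm) {
  *mod = modFromModRM(modRM);                              /* 3 = register-direct */
  *reg = (uint8_t)(regFromModRM(modRM) | (rFromREX(rex) << 3));
  *rm  = (uint8_t)(rmFromModRM(modRM)  | (bFromREX(rex) << 3));
}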
+ */ +static int fixupReg(struct InternalInstruction *insn, + const struct OperandSpecifier *op) { + uint8_t valid; + + dbgprintf(insn, "fixupReg()"); + + switch ((OperandEncoding)op->encoding) { + default: + debug("Expected a REG or R/M encoding in fixupReg"); + return -1; + case ENCODING_VVVV: + insn->vvvv = (Reg)fixupRegValue(insn, + (OperandType)op->type, + insn->vvvv, + &valid); + if (!valid) + return -1; + break; + case ENCODING_REG: + insn->reg = (Reg)fixupRegValue(insn, + (OperandType)op->type, + insn->reg - insn->regBase, + &valid); + if (!valid) + return -1; + break; + case ENCODING_RM: + if (insn->eaBase >= insn->eaRegBase) { + insn->eaBase = (EABase)fixupRMValue(insn, + (OperandType)op->type, + insn->eaBase - insn->eaRegBase, + &valid); + if (!valid) + return -1; + } + break; + } + + return 0; +} + +/* + * readOpcodeModifier - Reads an operand from the opcode field of an + * instruction. Handles AddRegFrm instructions. + * + * @param insn - The instruction whose opcode field is to be read. + * @param inModRM - Indicates that the opcode field is to be read from the + * ModR/M extension; useful for escape opcodes + * @return - 0 on success; nonzero otherwise. + */ +static int readOpcodeModifier(struct InternalInstruction* insn) { + dbgprintf(insn, "readOpcodeModifier()"); + + if (insn->consumedOpcodeModifier) + return 0; + + insn->consumedOpcodeModifier = TRUE; + + switch (insn->spec->modifierType) { + default: + debug("Unknown modifier type."); + return -1; + case MODIFIER_NONE: + debug("No modifier but an operand expects one."); + return -1; + case MODIFIER_OPCODE: + insn->opcodeModifier = insn->opcode - insn->spec->modifierBase; + return 0; + case MODIFIER_MODRM: + insn->opcodeModifier = insn->modRM - insn->spec->modifierBase; + return 0; + } +} + +/* + * readOpcodeRegister - Reads an operand from the opcode field of an + * instruction and interprets it appropriately given the operand width. + * Handles AddRegFrm instructions. + * + * @param insn - See readOpcodeModifier(). + * @param size - The width (in bytes) of the register being specified. + * 1 means AL and friends, 2 means AX, 4 means EAX, and 8 means + * RAX. + * @return - 0 on success; nonzero otherwise. + */ +static int readOpcodeRegister(struct InternalInstruction* insn, uint8_t size) { + dbgprintf(insn, "readOpcodeRegister()"); + + if (readOpcodeModifier(insn)) + return -1; + + if (size == 0) + size = insn->registerSize; + + switch (size) { + case 1: + insn->opcodeRegister = (Reg)(MODRM_REG_AL + ((bFromREX(insn->rexPrefix) << 3) + | insn->opcodeModifier)); + if (insn->rexPrefix && + insn->opcodeRegister >= MODRM_REG_AL + 0x4 && + insn->opcodeRegister < MODRM_REG_AL + 0x8) { + insn->opcodeRegister = (Reg)(MODRM_REG_SPL + + (insn->opcodeRegister - MODRM_REG_AL - 4)); + } + + break; + case 2: + insn->opcodeRegister = (Reg)(MODRM_REG_AX + + ((bFromREX(insn->rexPrefix) << 3) + | insn->opcodeModifier)); + break; + case 4: + insn->opcodeRegister = (Reg)(MODRM_REG_EAX + + ((bFromREX(insn->rexPrefix) << 3) + | insn->opcodeModifier)); + break; + case 8: + insn->opcodeRegister = (Reg)(MODRM_REG_RAX + + ((bFromREX(insn->rexPrefix) << 3) + | insn->opcodeModifier)); + break; + } + + return 0; +} + +/* + * readImmediate - Consumes an immediate operand from an instruction, given the + * desired operand size. + * + * @param insn - The instruction whose operand is to be read. + * @param size - The width (in bytes) of the operand. + * @return - 0 if the immediate was successfully consumed; nonzero + * otherwise. 
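readOpcodeRegister() above handles AddRegFrm encodings, where the register index is built from the low opcode bits and REX.B. For example, the byte pair 41 53 (REX.B then 0x53) yields index 11 and decodes as PUSH R11, assuming the specifier table supplies 0x50 as the modifier base for that opcode. A sketch of just the index computation:

#include "X86DisassemblerDecoder.h"

/* Register index for an AddRegFrm opcode, mirroring readOpcodeRegister(). */
static uint8_t opcodeRegisterIndex(uint8_t opcode, uint8_t modifierBase,
                                   uint8_t rexPrefix) {
  return (uint8_t)((bFromREX(rexPrefix) << 3) | (opcode - modifierBase));
}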
+ */ +static int readImmediate(struct InternalInstruction* insn, uint8_t size) { + uint8_t imm8; + uint16_t imm16; + uint32_t imm32; + uint64_t imm64; + + dbgprintf(insn, "readImmediate()"); + + if (insn->numImmediatesConsumed == 2) { + debug("Already consumed two immediates"); + return -1; + } + + if (size == 0) + size = insn->immediateSize; + else + insn->immediateSize = size; + + switch (size) { + case 1: + if (consumeByte(insn, &imm8)) + return -1; + insn->immediates[insn->numImmediatesConsumed] = imm8; + break; + case 2: + if (consumeUInt16(insn, &imm16)) + return -1; + insn->immediates[insn->numImmediatesConsumed] = imm16; + break; + case 4: + if (consumeUInt32(insn, &imm32)) + return -1; + insn->immediates[insn->numImmediatesConsumed] = imm32; + break; + case 8: + if (consumeUInt64(insn, &imm64)) + return -1; + insn->immediates[insn->numImmediatesConsumed] = imm64; + break; + } + + insn->numImmediatesConsumed++; + + return 0; +} + +/* + * readVVVV - Consumes vvvv from an instruction if it has a VEX prefix. + * + * @param insn - The instruction whose operand is to be read. + * @return - 0 if the vvvv was successfully consumed; nonzero + * otherwise. + */ +static int readVVVV(struct InternalInstruction* insn) { + dbgprintf(insn, "readVVVV()"); + + if (insn->vexSize == 3) + insn->vvvv = vvvvFromVEX3of3(insn->vexPrefix[2]); + else if (insn->vexSize == 2) + insn->vvvv = vvvvFromVEX2of2(insn->vexPrefix[1]); + else + return -1; + + if (insn->mode != MODE_64BIT) + insn->vvvv &= 0x7; + + return 0; +} + +/* + * readOperands - Consults the specifier for an instruction and consumes all + * operands for that instruction, interpreting them as it goes. + * + * @param insn - The instruction whose operands are to be read and interpreted. + * @return - 0 if all operands could be read; nonzero otherwise. + */ +static int readOperands(struct InternalInstruction* insn) { + int index; + int hasVVVV, needVVVV; + + dbgprintf(insn, "readOperands()"); + + /* If non-zero vvvv specified, need to make sure one of the operands + uses it. 
*/ + hasVVVV = !readVVVV(insn); + needVVVV = hasVVVV && (insn->vvvv != 0); + + for (index = 0; index < X86_MAX_OPERANDS; ++index) { + switch (insn->spec->operands[index].encoding) { + case ENCODING_NONE: + break; + case ENCODING_REG: + case ENCODING_RM: + if (readModRM(insn)) + return -1; + if (fixupReg(insn, &insn->spec->operands[index])) + return -1; + break; + case ENCODING_CB: + case ENCODING_CW: + case ENCODING_CD: + case ENCODING_CP: + case ENCODING_CO: + case ENCODING_CT: + dbgprintf(insn, "We currently don't hande code-offset encodings"); + return -1; + case ENCODING_IB: + if (readImmediate(insn, 1)) + return -1; + if (insn->spec->operands[index].type == TYPE_IMM3 && + insn->immediates[insn->numImmediatesConsumed - 1] > 7) + return -1; + break; + case ENCODING_IW: + if (readImmediate(insn, 2)) + return -1; + break; + case ENCODING_ID: + if (readImmediate(insn, 4)) + return -1; + break; + case ENCODING_IO: + if (readImmediate(insn, 8)) + return -1; + break; + case ENCODING_Iv: + if (readImmediate(insn, insn->immediateSize)) + return -1; + break; + case ENCODING_Ia: + if (readImmediate(insn, insn->addressSize)) + return -1; + break; + case ENCODING_RB: + if (readOpcodeRegister(insn, 1)) + return -1; + break; + case ENCODING_RW: + if (readOpcodeRegister(insn, 2)) + return -1; + break; + case ENCODING_RD: + if (readOpcodeRegister(insn, 4)) + return -1; + break; + case ENCODING_RO: + if (readOpcodeRegister(insn, 8)) + return -1; + break; + case ENCODING_Rv: + if (readOpcodeRegister(insn, 0)) + return -1; + break; + case ENCODING_I: + if (readOpcodeModifier(insn)) + return -1; + break; + case ENCODING_VVVV: + needVVVV = 0; /* Mark that we have found a VVVV operand. */ + if (!hasVVVV) + return -1; + if (fixupReg(insn, &insn->spec->operands[index])) + return -1; + break; + case ENCODING_DUP: + break; + default: + dbgprintf(insn, "Encountered an operand with an unknown encoding."); + return -1; + } + } + + /* If we didn't find ENCODING_VVVV operand, but non-zero vvvv present, fail */ + if (needVVVV) return -1; + + return 0; +} + +/* + * decodeInstruction - Reads and interprets a full instruction provided by the + * user. + * + * @param insn - A pointer to the instruction to be populated. Must be + * pre-allocated. + * @param reader - The function to be used to read the instruction's bytes. + * @param readerArg - A generic argument to be passed to the reader to store + * any internal state. + * @param logger - If non-NULL, the function to be used to write log messages + * and warnings. + * @param loggerArg - A generic argument to be passed to the logger to store + * any internal state. + * @param startLoc - The address (in the reader's address space) of the first + * byte in the instruction. + * @param mode - The mode (real mode, IA-32e, or IA-32e in 64-bit mode) to + * decode the instruction in. + * @return - 0 if the instruction's memory could be read; nonzero if + * not. 
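The logger/loggerArg pair documented above is the hook for all of the dbgprintf() messages emitted by the routines in this file. A dlog_t implementation matching the typedef in X86DisassemblerDecoder.h; stderrLogger is an illustrative name, and the NULL baton shows that no per-logger state is required:

#include <stdio.h>
#include "X86DisassemblerDecoder.h"

/* dlog_t callback; pass (stderrLogger, NULL) as logger/loggerArg. */
static void stderrLogger(void *arg, const char *log) {
  (void)arg;                                 /* no logger state needed */
  fprintf(stderr, "x86 decoder: %s\n", log);
}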
+ */ +int decodeInstruction(struct InternalInstruction* insn, + byteReader_t reader, + void* readerArg, + dlog_t logger, + void* loggerArg, + uint64_t startLoc, + DisassemblerMode mode) { + memset(insn, 0, sizeof(struct InternalInstruction)); + + insn->reader = reader; + insn->readerArg = readerArg; + insn->dlog = logger; + insn->dlogArg = loggerArg; + insn->startLocation = startLoc; + insn->readerCursor = startLoc; + insn->mode = mode; + insn->numImmediatesConsumed = 0; + + if (readPrefixes(insn) || + readOpcode(insn) || + getID(insn) || + insn->instructionID == 0 || + readOperands(insn)) + return -1; + + insn->length = insn->readerCursor - insn->startLocation; + + dbgprintf(insn, "Read from 0x%llx to 0x%llx: length %zu", + startLoc, insn->readerCursor, insn->length); + + if (insn->length > 15) + dbgprintf(insn, "Instruction exceeds 15-byte limit"); + + return 0; +} diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h new file mode 100644 index 0000000..a9c90f8 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h @@ -0,0 +1,575 @@ +/*===- X86DisassemblerDecoderInternal.h - Disassembler decoder -----*- C -*-==* + * + * The LLVM Compiler Infrastructure + * + * This file is distributed under the University of Illinois Open Source + * License. See LICENSE.TXT for details. + * + *===----------------------------------------------------------------------===* + * + * This file is part of the X86 Disassembler. + * It contains the public interface of the instruction decoder. + * Documentation for the disassembler can be found in X86Disassembler.h. + * + *===----------------------------------------------------------------------===*/ + +#ifndef X86DISASSEMBLERDECODER_H +#define X86DISASSEMBLERDECODER_H + +#ifdef __cplusplus +extern "C" { +#endif + +#define INSTRUCTION_SPECIFIER_FIELDS \ + const char* name; + +#define INSTRUCTION_IDS \ + const InstrUID *instructionIDs; + +#include "X86DisassemblerDecoderCommon.h" + +#undef INSTRUCTION_SPECIFIER_FIELDS +#undef INSTRUCTION_IDS + +/* + * Accessor functions for various fields of an Intel instruction + */ +#define modFromModRM(modRM) (((modRM) & 0xc0) >> 6) +#define regFromModRM(modRM) (((modRM) & 0x38) >> 3) +#define rmFromModRM(modRM) ((modRM) & 0x7) +#define scaleFromSIB(sib) (((sib) & 0xc0) >> 6) +#define indexFromSIB(sib) (((sib) & 0x38) >> 3) +#define baseFromSIB(sib) ((sib) & 0x7) +#define wFromREX(rex) (((rex) & 0x8) >> 3) +#define rFromREX(rex) (((rex) & 0x4) >> 2) +#define xFromREX(rex) (((rex) & 0x2) >> 1) +#define bFromREX(rex) ((rex) & 0x1) + +#define rFromVEX2of3(vex) (((~(vex)) & 0x80) >> 7) +#define xFromVEX2of3(vex) (((~(vex)) & 0x40) >> 6) +#define bFromVEX2of3(vex) (((~(vex)) & 0x20) >> 5) +#define mmmmmFromVEX2of3(vex) ((vex) & 0x1f) +#define wFromVEX3of3(vex) (((vex) & 0x80) >> 7) +#define vvvvFromVEX3of3(vex) (((~(vex)) & 0x78) >> 3) +#define lFromVEX3of3(vex) (((vex) & 0x4) >> 2) +#define ppFromVEX3of3(vex) ((vex) & 0x3) + +#define rFromVEX2of2(vex) (((~(vex)) & 0x80) >> 7) +#define vvvvFromVEX2of2(vex) (((~(vex)) & 0x78) >> 3) +#define lFromVEX2of2(vex) (((vex) & 0x4) >> 2) +#define ppFromVEX2of2(vex) ((vex) & 0x3) + +/* + * These enums represent Intel registers for use by the decoder. 
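When a three-byte VEX prefix is found, readPrefixes() above synthesizes an equivalent REX byte so the rest of the decoder can stay prefix-agnostic. The same construction, expressed with the field accessors defined above; vex1 and vex2 are the two payload bytes that follow 0xc4, and the inverted R/X/B bits are already un-inverted by the macros. Sketch only:

#include "X86DisassemblerDecoder.h"

/* Simulated REX prefix built from a 3-byte VEX prefix, as in readPrefixes(). */
static uint8_t simulatedREXFromVEX3(uint8_t vex1, uint8_t vex2) {
  return (uint8_t)(0x40
                   | (wFromVEX3of3(vex2) << 3)
                   | (rFromVEX2of3(vex1) << 2)
                   | (xFromVEX2of3(vex1) << 1)
                   | (bFromVEX2of3(vex1) << 0));
}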
+ */ + +#define REGS_8BIT \ + ENTRY(AL) \ + ENTRY(CL) \ + ENTRY(DL) \ + ENTRY(BL) \ + ENTRY(AH) \ + ENTRY(CH) \ + ENTRY(DH) \ + ENTRY(BH) \ + ENTRY(R8B) \ + ENTRY(R9B) \ + ENTRY(R10B) \ + ENTRY(R11B) \ + ENTRY(R12B) \ + ENTRY(R13B) \ + ENTRY(R14B) \ + ENTRY(R15B) \ + ENTRY(SPL) \ + ENTRY(BPL) \ + ENTRY(SIL) \ + ENTRY(DIL) + +#define EA_BASES_16BIT \ + ENTRY(BX_SI) \ + ENTRY(BX_DI) \ + ENTRY(BP_SI) \ + ENTRY(BP_DI) \ + ENTRY(SI) \ + ENTRY(DI) \ + ENTRY(BP) \ + ENTRY(BX) \ + ENTRY(R8W) \ + ENTRY(R9W) \ + ENTRY(R10W) \ + ENTRY(R11W) \ + ENTRY(R12W) \ + ENTRY(R13W) \ + ENTRY(R14W) \ + ENTRY(R15W) + +#define REGS_16BIT \ + ENTRY(AX) \ + ENTRY(CX) \ + ENTRY(DX) \ + ENTRY(BX) \ + ENTRY(SP) \ + ENTRY(BP) \ + ENTRY(SI) \ + ENTRY(DI) \ + ENTRY(R8W) \ + ENTRY(R9W) \ + ENTRY(R10W) \ + ENTRY(R11W) \ + ENTRY(R12W) \ + ENTRY(R13W) \ + ENTRY(R14W) \ + ENTRY(R15W) + +#define EA_BASES_32BIT \ + ENTRY(EAX) \ + ENTRY(ECX) \ + ENTRY(EDX) \ + ENTRY(EBX) \ + ENTRY(sib) \ + ENTRY(EBP) \ + ENTRY(ESI) \ + ENTRY(EDI) \ + ENTRY(R8D) \ + ENTRY(R9D) \ + ENTRY(R10D) \ + ENTRY(R11D) \ + ENTRY(R12D) \ + ENTRY(R13D) \ + ENTRY(R14D) \ + ENTRY(R15D) + +#define REGS_32BIT \ + ENTRY(EAX) \ + ENTRY(ECX) \ + ENTRY(EDX) \ + ENTRY(EBX) \ + ENTRY(ESP) \ + ENTRY(EBP) \ + ENTRY(ESI) \ + ENTRY(EDI) \ + ENTRY(R8D) \ + ENTRY(R9D) \ + ENTRY(R10D) \ + ENTRY(R11D) \ + ENTRY(R12D) \ + ENTRY(R13D) \ + ENTRY(R14D) \ + ENTRY(R15D) + +#define EA_BASES_64BIT \ + ENTRY(RAX) \ + ENTRY(RCX) \ + ENTRY(RDX) \ + ENTRY(RBX) \ + ENTRY(sib64) \ + ENTRY(RBP) \ + ENTRY(RSI) \ + ENTRY(RDI) \ + ENTRY(R8) \ + ENTRY(R9) \ + ENTRY(R10) \ + ENTRY(R11) \ + ENTRY(R12) \ + ENTRY(R13) \ + ENTRY(R14) \ + ENTRY(R15) + +#define REGS_64BIT \ + ENTRY(RAX) \ + ENTRY(RCX) \ + ENTRY(RDX) \ + ENTRY(RBX) \ + ENTRY(RSP) \ + ENTRY(RBP) \ + ENTRY(RSI) \ + ENTRY(RDI) \ + ENTRY(R8) \ + ENTRY(R9) \ + ENTRY(R10) \ + ENTRY(R11) \ + ENTRY(R12) \ + ENTRY(R13) \ + ENTRY(R14) \ + ENTRY(R15) + +#define REGS_MMX \ + ENTRY(MM0) \ + ENTRY(MM1) \ + ENTRY(MM2) \ + ENTRY(MM3) \ + ENTRY(MM4) \ + ENTRY(MM5) \ + ENTRY(MM6) \ + ENTRY(MM7) + +#define REGS_XMM \ + ENTRY(XMM0) \ + ENTRY(XMM1) \ + ENTRY(XMM2) \ + ENTRY(XMM3) \ + ENTRY(XMM4) \ + ENTRY(XMM5) \ + ENTRY(XMM6) \ + ENTRY(XMM7) \ + ENTRY(XMM8) \ + ENTRY(XMM9) \ + ENTRY(XMM10) \ + ENTRY(XMM11) \ + ENTRY(XMM12) \ + ENTRY(XMM13) \ + ENTRY(XMM14) \ + ENTRY(XMM15) + +#define REGS_YMM \ + ENTRY(YMM0) \ + ENTRY(YMM1) \ + ENTRY(YMM2) \ + ENTRY(YMM3) \ + ENTRY(YMM4) \ + ENTRY(YMM5) \ + ENTRY(YMM6) \ + ENTRY(YMM7) \ + ENTRY(YMM8) \ + ENTRY(YMM9) \ + ENTRY(YMM10) \ + ENTRY(YMM11) \ + ENTRY(YMM12) \ + ENTRY(YMM13) \ + ENTRY(YMM14) \ + ENTRY(YMM15) + +#define REGS_SEGMENT \ + ENTRY(ES) \ + ENTRY(CS) \ + ENTRY(SS) \ + ENTRY(DS) \ + ENTRY(FS) \ + ENTRY(GS) + +#define REGS_DEBUG \ + ENTRY(DR0) \ + ENTRY(DR1) \ + ENTRY(DR2) \ + ENTRY(DR3) \ + ENTRY(DR4) \ + ENTRY(DR5) \ + ENTRY(DR6) \ + ENTRY(DR7) + +#define REGS_CONTROL \ + ENTRY(CR0) \ + ENTRY(CR1) \ + ENTRY(CR2) \ + ENTRY(CR3) \ + ENTRY(CR4) \ + ENTRY(CR5) \ + ENTRY(CR6) \ + ENTRY(CR7) \ + ENTRY(CR8) + +#define ALL_EA_BASES \ + EA_BASES_16BIT \ + EA_BASES_32BIT \ + EA_BASES_64BIT + +#define ALL_SIB_BASES \ + REGS_32BIT \ + REGS_64BIT + +#define ALL_REGS \ + REGS_8BIT \ + REGS_16BIT \ + REGS_32BIT \ + REGS_64BIT \ + REGS_MMX \ + REGS_XMM \ + REGS_YMM \ + REGS_SEGMENT \ + REGS_DEBUG \ + REGS_CONTROL \ + ENTRY(RIP) + +/* + * EABase - All possible values of the base field for effective-address + * computations, a.k.a. the Mod and R/M fields of the ModR/M byte. 
We + * distinguish between bases (EA_BASE_*) and registers that just happen to be + * referred to when Mod == 0b11 (EA_REG_*). + */ +typedef enum { + EA_BASE_NONE, +#define ENTRY(x) EA_BASE_##x, + ALL_EA_BASES +#undef ENTRY +#define ENTRY(x) EA_REG_##x, + ALL_REGS +#undef ENTRY + EA_max +} EABase; + +/* + * SIBIndex - All possible values of the SIB index field. + * Borrows entries from ALL_EA_BASES with the special case that + * sib is synonymous with NONE. + */ +typedef enum { + SIB_INDEX_NONE, +#define ENTRY(x) SIB_INDEX_##x, + ALL_EA_BASES +#undef ENTRY + SIB_INDEX_max +} SIBIndex; + +/* + * SIBBase - All possible values of the SIB base field. + */ +typedef enum { + SIB_BASE_NONE, +#define ENTRY(x) SIB_BASE_##x, + ALL_SIB_BASES +#undef ENTRY + SIB_BASE_max +} SIBBase; + +/* + * EADisplacement - Possible displacement types for effective-address + * computations. + */ +typedef enum { + EA_DISP_NONE, + EA_DISP_8, + EA_DISP_16, + EA_DISP_32 +} EADisplacement; + +/* + * Reg - All possible values of the reg field in the ModR/M byte. + */ +typedef enum { +#define ENTRY(x) MODRM_REG_##x, + ALL_REGS +#undef ENTRY + MODRM_REG_max +} Reg; + +/* + * SegmentOverride - All possible segment overrides. + */ +typedef enum { + SEG_OVERRIDE_NONE, + SEG_OVERRIDE_CS, + SEG_OVERRIDE_SS, + SEG_OVERRIDE_DS, + SEG_OVERRIDE_ES, + SEG_OVERRIDE_FS, + SEG_OVERRIDE_GS, + SEG_OVERRIDE_max +} SegmentOverride; + +/* + * VEXLeadingOpcodeByte - Possible values for the VEX.m-mmmm field + */ + +typedef enum { + VEX_LOB_0F = 0x1, + VEX_LOB_0F38 = 0x2, + VEX_LOB_0F3A = 0x3 +} VEXLeadingOpcodeByte; + +/* + * VEXPrefixCode - Possible values for the VEX.pp field + */ + +typedef enum { + VEX_PREFIX_NONE = 0x0, + VEX_PREFIX_66 = 0x1, + VEX_PREFIX_F3 = 0x2, + VEX_PREFIX_F2 = 0x3 +} VEXPrefixCode; + +typedef uint8_t BOOL; + +/* + * byteReader_t - Type for the byte reader that the consumer must provide to + * the decoder. Reads a single byte from the instruction's address space. + * @param arg - A baton that the consumer can associate with any internal + * state that it needs. + * @param byte - A pointer to a single byte in memory that should be set to + * contain the value at address. + * @param address - The address in the instruction's address space that should + * be read from. + * @return - -1 if the byte cannot be read for any reason; 0 otherwise. + */ +typedef int (*byteReader_t)(void* arg, uint8_t* byte, uint64_t address); + +/* + * dlog_t - Type for the logging function that the consumer can provide to + * get debugging output from the decoder. + * @param arg - A baton that the consumer can associate with any internal + * state that it needs. + * @param log - A string that contains the message. Will be reused after + * the logger returns. + */ +typedef void (*dlog_t)(void* arg, const char *log); + +/* + * The x86 internal instruction, which is produced by the decoder. 
+ */ +struct InternalInstruction { + /* Reader interface (C) */ + byteReader_t reader; + /* Opaque value passed to the reader */ + void* readerArg; + /* The address of the next byte to read via the reader */ + uint64_t readerCursor; + + /* Logger interface (C) */ + dlog_t dlog; + /* Opaque value passed to the logger */ + void* dlogArg; + + /* General instruction information */ + + /* The mode to disassemble for (64-bit, protected, real) */ + DisassemblerMode mode; + /* The start of the instruction, usable with the reader */ + uint64_t startLocation; + /* The length of the instruction, in bytes */ + size_t length; + + /* Prefix state */ + + /* 1 if the prefix byte corresponding to the entry is present; 0 if not */ + uint8_t prefixPresent[0x100]; + /* contains the location (for use with the reader) of the prefix byte */ + uint64_t prefixLocations[0x100]; + /* The value of the VEX prefix, if present */ + uint8_t vexPrefix[3]; + /* The length of the VEX prefix (0 if not present) */ + uint8_t vexSize; + /* The value of the REX prefix, if present */ + uint8_t rexPrefix; + /* The location where a mandatory prefix would have to be (i.e., right before + the opcode, or right before the REX prefix if one is present) */ + uint64_t necessaryPrefixLocation; + /* The segment override type */ + SegmentOverride segmentOverride; + + /* Sizes of various critical pieces of data, in bytes */ + uint8_t registerSize; + uint8_t addressSize; + uint8_t displacementSize; + uint8_t immediateSize; + + /* opcode state */ + + /* The value of the two-byte escape prefix (usually 0x0f) */ + uint8_t twoByteEscape; + /* The value of the three-byte escape prefix (usually 0x38 or 0x3a) */ + uint8_t threeByteEscape; + /* The last byte of the opcode, not counting any ModR/M extension */ + uint8_t opcode; + /* The ModR/M byte of the instruction, if it is an opcode extension */ + uint8_t modRMExtension; + + /* decode state */ + + /* The type of opcode, used for indexing into the array of decode tables */ + OpcodeType opcodeType; + /* The instruction ID, extracted from the decode table */ + uint16_t instructionID; + /* The specifier for the instruction, from the instruction info table */ + const struct InstructionSpecifier *spec; + + /* state for additional bytes, consumed during operand decode. Pattern: + consumed___ indicates that the byte was already consumed and does not + need to be consumed again */ + + /* The VEX.vvvv field, which contains a third register operand for some AVX + instructions */ + Reg vvvv; + + /* The ModR/M byte, which contains most register operands and some portion of + all memory operands */ + BOOL consumedModRM; + uint8_t modRM; + + /* The SIB byte, used for more complex 32- or 64-bit memory operands */ + BOOL consumedSIB; + uint8_t sib; + + /* The displacement, used for memory operands */ + BOOL consumedDisplacement; + int32_t displacement; + + /* Immediates. There can be two in some cases */ + uint8_t numImmediatesConsumed; + uint8_t numImmediatesTranslated; + uint64_t immediates[2]; + + /* A register or immediate operand encoded into the opcode */ + BOOL consumedOpcodeModifier; + uint8_t opcodeModifier; + Reg opcodeRegister; + + /* Portions of the ModR/M byte */ + + /* These fields determine the allowable values for the ModR/M fields, which + depend on operand and address widths */ + EABase eaBaseBase; + EABase eaRegBase; + Reg regBase; + + /* The Mod and R/M fields can encode a base for an effective address, or a + register. 
These are separated into two fields here */ + EABase eaBase; + EADisplacement eaDisplacement; + /* The reg field always encodes a register */ + Reg reg; + + /* SIB state */ + SIBIndex sibIndex; + uint8_t sibScale; + SIBBase sibBase; +}; + +/* decodeInstruction - Decode one instruction and store the decoding results in + * a buffer provided by the consumer. + * @param insn - The buffer to store the instruction in. Allocated by the + * consumer. + * @param reader - The byteReader_t for the bytes to be read. + * @param readerArg - An argument to pass to the reader for storing context + * specific to the consumer. May be NULL. + * @param logger - The dlog_t to be used in printing status messages from the + * disassembler. May be NULL. + * @param loggerArg - An argument to pass to the logger for storing context + * specific to the logger. May be NULL. + * @param startLoc - The address (in the reader's address space) of the first + * byte in the instruction. + * @param mode - The mode (16-bit, 32-bit, 64-bit) to decode in. + * @return - Nonzero if there was an error during decode, 0 otherwise. + */ +int decodeInstruction(struct InternalInstruction* insn, + byteReader_t reader, + void* readerArg, + dlog_t logger, + void* loggerArg, + uint64_t startLoc, + DisassemblerMode mode); + +/* x86DisassemblerDebug - C-accessible function for printing a message to + * debugs() + * @param file - The name of the file printing the debug message. + * @param line - The line number that printed the debug message. + * @param s - The message to print. + */ + +void x86DisassemblerDebug(const char *file, + unsigned line, + const char *s); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h new file mode 100644 index 0000000..8b79335 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h @@ -0,0 +1,385 @@ +/*===- X86DisassemblerDecoderCommon.h - Disassembler decoder -------*- C -*-==* + * + * The LLVM Compiler Infrastructure + * + * This file is distributed under the University of Illinois Open Source + * License. See LICENSE.TXT for details. + * + *===----------------------------------------------------------------------===* + * + * This file is part of the X86 Disassembler. + * It contains common definitions used by both the disassembler and the table + * generator. + * Documentation for the disassembler can be found in X86Disassembler.h. + * + *===----------------------------------------------------------------------===*/ + +/* + * This header file provides those definitions that need to be shared between + * the decoder and the table generator in a C-friendly manner. 
+ */ + +#ifndef X86DISASSEMBLERDECODERCOMMON_H +#define X86DISASSEMBLERDECODERCOMMON_H + +#include "llvm/Support/DataTypes.h" + +#define INSTRUCTIONS_SYM x86DisassemblerInstrSpecifiers +#define CONTEXTS_SYM x86DisassemblerContexts +#define ONEBYTE_SYM x86DisassemblerOneByteOpcodes +#define TWOBYTE_SYM x86DisassemblerTwoByteOpcodes +#define THREEBYTE38_SYM x86DisassemblerThreeByte38Opcodes +#define THREEBYTE3A_SYM x86DisassemblerThreeByte3AOpcodes +#define THREEBYTEA6_SYM x86DisassemblerThreeByteA6Opcodes +#define THREEBYTEA7_SYM x86DisassemblerThreeByteA7Opcodes + +#define INSTRUCTIONS_STR "x86DisassemblerInstrSpecifiers" +#define CONTEXTS_STR "x86DisassemblerContexts" +#define ONEBYTE_STR "x86DisassemblerOneByteOpcodes" +#define TWOBYTE_STR "x86DisassemblerTwoByteOpcodes" +#define THREEBYTE38_STR "x86DisassemblerThreeByte38Opcodes" +#define THREEBYTE3A_STR "x86DisassemblerThreeByte3AOpcodes" +#define THREEBYTEA6_STR "x86DisassemblerThreeByteA6Opcodes" +#define THREEBYTEA7_STR "x86DisassemblerThreeByteA7Opcodes" + +/* + * Attributes of an instruction that must be known before the opcode can be + * processed correctly. Most of these indicate the presence of particular + * prefixes, but ATTR_64BIT is simply an attribute of the decoding context. + */ +#define ATTRIBUTE_BITS \ + ENUM_ENTRY(ATTR_NONE, 0x00) \ + ENUM_ENTRY(ATTR_64BIT, 0x01) \ + ENUM_ENTRY(ATTR_XS, 0x02) \ + ENUM_ENTRY(ATTR_XD, 0x04) \ + ENUM_ENTRY(ATTR_REXW, 0x08) \ + ENUM_ENTRY(ATTR_OPSIZE, 0x10) \ + ENUM_ENTRY(ATTR_VEX, 0x20) \ + ENUM_ENTRY(ATTR_VEXL, 0x40) + +#define ENUM_ENTRY(n, v) n = v, +enum attributeBits { + ATTRIBUTE_BITS + ATTR_max +}; +#undef ENUM_ENTRY + +/* + * Combinations of the above attributes that are relevant to instruction + * decode. Although other combinations are possible, they can be reduced to + * these without affecting the ultimately decoded instruction. + */ + +/* Class name Rank Rationale for rank assignment */ +#define INSTRUCTION_CONTEXTS \ + ENUM_ENTRY(IC, 0, "says nothing about the instruction") \ + ENUM_ENTRY(IC_64BIT, 1, "says the instruction applies in " \ + "64-bit mode but no more") \ + ENUM_ENTRY(IC_OPSIZE, 3, "requires an OPSIZE prefix, so " \ + "operands change width") \ + ENUM_ENTRY(IC_XD, 2, "may say something about the opcode " \ + "but not the operands") \ + ENUM_ENTRY(IC_XS, 2, "may say something about the opcode " \ + "but not the operands") \ + ENUM_ENTRY(IC_XD_OPSIZE, 3, "requires an OPSIZE prefix, so " \ + "operands change width") \ + ENUM_ENTRY(IC_XS_OPSIZE, 3, "requires an OPSIZE prefix, so " \ + "operands change width") \ + ENUM_ENTRY(IC_64BIT_REXW, 4, "requires a REX.W prefix, so operands "\ + "change width; overrides IC_OPSIZE") \ + ENUM_ENTRY(IC_64BIT_OPSIZE, 3, "Just as meaningful as IC_OPSIZE") \ + ENUM_ENTRY(IC_64BIT_XD, 5, "XD instructions are SSE; REX.W is " \ + "secondary") \ + ENUM_ENTRY(IC_64BIT_XS, 5, "Just as meaningful as IC_64BIT_XD") \ + ENUM_ENTRY(IC_64BIT_XD_OPSIZE, 3, "Just as meaningful as IC_XD_OPSIZE") \ + ENUM_ENTRY(IC_64BIT_XS_OPSIZE, 3, "Just as meaningful as IC_XS_OPSIZE") \ + ENUM_ENTRY(IC_64BIT_REXW_XS, 6, "OPSIZE could mean a different " \ + "opcode") \ + ENUM_ENTRY(IC_64BIT_REXW_XD, 6, "Just as meaningful as " \ + "IC_64BIT_REXW_XS") \ + ENUM_ENTRY(IC_64BIT_REXW_OPSIZE, 7, "The Dynamic Duo! 
Prefer over all " \ + "else because this changes most " \ + "operands' meaning") \ + ENUM_ENTRY(IC_VEX, 1, "requires a VEX prefix") \ + ENUM_ENTRY(IC_VEX_XS, 2, "requires VEX and the XS prefix") \ + ENUM_ENTRY(IC_VEX_XD, 2, "requires VEX and the XD prefix") \ + ENUM_ENTRY(IC_VEX_OPSIZE, 2, "requires VEX and the OpSize prefix") \ + ENUM_ENTRY(IC_VEX_W, 3, "requires VEX and the W prefix") \ + ENUM_ENTRY(IC_VEX_W_XS, 4, "requires VEX, W, and XS prefix") \ + ENUM_ENTRY(IC_VEX_W_XD, 4, "requires VEX, W, and XD prefix") \ + ENUM_ENTRY(IC_VEX_W_OPSIZE, 4, "requires VEX, W, and OpSize") \ + ENUM_ENTRY(IC_VEX_L, 3, "requires VEX and the L prefix") \ + ENUM_ENTRY(IC_VEX_L_XS, 4, "requires VEX and the L and XS prefix")\ + ENUM_ENTRY(IC_VEX_L_XD, 4, "requires VEX and the L and XD prefix")\ + ENUM_ENTRY(IC_VEX_L_OPSIZE, 4, "requires VEX, L, and OpSize") + + +#define ENUM_ENTRY(n, r, d) n, +typedef enum { + INSTRUCTION_CONTEXTS + IC_max +} InstructionContext; +#undef ENUM_ENTRY + +/* + * Opcode types, which determine which decode table to use, both in the Intel + * manual and also for the decoder. + */ +typedef enum { + ONEBYTE = 0, + TWOBYTE = 1, + THREEBYTE_38 = 2, + THREEBYTE_3A = 3, + THREEBYTE_A6 = 4, + THREEBYTE_A7 = 5 +} OpcodeType; + +/* + * The following structs are used for the hierarchical decode table. After + * determining the instruction's class (i.e., which IC_* constant applies to + * it), the decoder reads the opcode. Some instructions require specific + * values of the ModR/M byte, so the ModR/M byte indexes into the final table. + * + * If a ModR/M byte is not required, "required" is left unset, and the values + * for each instructionID are identical. + */ + +typedef uint16_t InstrUID; + +/* + * ModRMDecisionType - describes the type of ModR/M decision, allowing the + * consumer to determine the number of entries in it. + * + * MODRM_ONEENTRY - No matter what the value of the ModR/M byte is, the decoded + * instruction is the same. + * MODRM_SPLITRM - If the ModR/M byte is between 0x00 and 0xbf, the opcode + * corresponds to one instruction; otherwise, it corresponds to + * a different instruction. + * MODRM_FULL - Potentially, each value of the ModR/M byte could correspond + * to a different instruction. + */ + +#define MODRMTYPES \ + ENUM_ENTRY(MODRM_ONEENTRY) \ + ENUM_ENTRY(MODRM_SPLITRM) \ + ENUM_ENTRY(MODRM_FULL) + +#define ENUM_ENTRY(n) n, +typedef enum { + MODRMTYPES + MODRM_max +} ModRMDecisionType; +#undef ENUM_ENTRY + +/* + * ModRMDecision - Specifies whether a ModR/M byte is needed and (if so) which + * instruction each possible value of the ModR/M byte corresponds to. Once + * this information is known, we have narrowed down to a single instruction. + */ +struct ModRMDecision { + uint8_t modrm_type; + + /* The macro below must be defined wherever this file is included. */ + INSTRUCTION_IDS +}; + +/* + * OpcodeDecision - Specifies which set of ModR/M->instruction tables to look at + * given a particular opcode. + */ +struct OpcodeDecision { + struct ModRMDecision modRMDecisions[256]; +}; + +/* + * ContextDecision - Specifies which opcode->instruction tables to look at given + * a particular context (set of attributes). Since there are many possible + * contexts, the decoder first uses CONTEXTS_SYM to determine which context + * applies given a specific set of attributes. Hence there are only IC_max + * entries in this table, rather than 2^(ATTR_max). 
+ */ +struct ContextDecision { + struct OpcodeDecision opcodeDecisions[IC_max]; +}; + +/* + * Physical encodings of instruction operands. + */ + +#define ENCODINGS \ + ENUM_ENTRY(ENCODING_NONE, "") \ + ENUM_ENTRY(ENCODING_REG, "Register operand in ModR/M byte.") \ + ENUM_ENTRY(ENCODING_RM, "R/M operand in ModR/M byte.") \ + ENUM_ENTRY(ENCODING_VVVV, "Register operand in VEX.vvvv byte.") \ + ENUM_ENTRY(ENCODING_CB, "1-byte code offset (possible new CS value)") \ + ENUM_ENTRY(ENCODING_CW, "2-byte") \ + ENUM_ENTRY(ENCODING_CD, "4-byte") \ + ENUM_ENTRY(ENCODING_CP, "6-byte") \ + ENUM_ENTRY(ENCODING_CO, "8-byte") \ + ENUM_ENTRY(ENCODING_CT, "10-byte") \ + ENUM_ENTRY(ENCODING_IB, "1-byte immediate") \ + ENUM_ENTRY(ENCODING_IW, "2-byte") \ + ENUM_ENTRY(ENCODING_ID, "4-byte") \ + ENUM_ENTRY(ENCODING_IO, "8-byte") \ + ENUM_ENTRY(ENCODING_RB, "(AL..DIL, R8L..R15L) Register code added to " \ + "the opcode byte") \ + ENUM_ENTRY(ENCODING_RW, "(AX..DI, R8W..R15W)") \ + ENUM_ENTRY(ENCODING_RD, "(EAX..EDI, R8D..R15D)") \ + ENUM_ENTRY(ENCODING_RO, "(RAX..RDI, R8..R15)") \ + ENUM_ENTRY(ENCODING_I, "Position on floating-point stack added to the " \ + "opcode byte") \ + \ + ENUM_ENTRY(ENCODING_Iv, "Immediate of operand size") \ + ENUM_ENTRY(ENCODING_Ia, "Immediate of address size") \ + ENUM_ENTRY(ENCODING_Rv, "Register code of operand size added to the " \ + "opcode byte") \ + ENUM_ENTRY(ENCODING_DUP, "Duplicate of another operand; ID is encoded " \ + "in type") + +#define ENUM_ENTRY(n, d) n, + typedef enum { + ENCODINGS + ENCODING_max + } OperandEncoding; +#undef ENUM_ENTRY + +/* + * Semantic interpretations of instruction operands. + */ + +#define TYPES \ + ENUM_ENTRY(TYPE_NONE, "") \ + ENUM_ENTRY(TYPE_REL8, "1-byte immediate address") \ + ENUM_ENTRY(TYPE_REL16, "2-byte") \ + ENUM_ENTRY(TYPE_REL32, "4-byte") \ + ENUM_ENTRY(TYPE_REL64, "8-byte") \ + ENUM_ENTRY(TYPE_PTR1616, "2+2-byte segment+offset address") \ + ENUM_ENTRY(TYPE_PTR1632, "2+4-byte") \ + ENUM_ENTRY(TYPE_PTR1664, "2+8-byte") \ + ENUM_ENTRY(TYPE_R8, "1-byte register operand") \ + ENUM_ENTRY(TYPE_R16, "2-byte") \ + ENUM_ENTRY(TYPE_R32, "4-byte") \ + ENUM_ENTRY(TYPE_R64, "8-byte") \ + ENUM_ENTRY(TYPE_IMM8, "1-byte immediate operand") \ + ENUM_ENTRY(TYPE_IMM16, "2-byte") \ + ENUM_ENTRY(TYPE_IMM32, "4-byte") \ + ENUM_ENTRY(TYPE_IMM64, "8-byte") \ + ENUM_ENTRY(TYPE_IMM3, "1-byte immediate operand between 0 and 7") \ + ENUM_ENTRY(TYPE_RM8, "1-byte register or memory operand") \ + ENUM_ENTRY(TYPE_RM16, "2-byte") \ + ENUM_ENTRY(TYPE_RM32, "4-byte") \ + ENUM_ENTRY(TYPE_RM64, "8-byte") \ + ENUM_ENTRY(TYPE_M, "Memory operand") \ + ENUM_ENTRY(TYPE_M8, "1-byte") \ + ENUM_ENTRY(TYPE_M16, "2-byte") \ + ENUM_ENTRY(TYPE_M32, "4-byte") \ + ENUM_ENTRY(TYPE_M64, "8-byte") \ + ENUM_ENTRY(TYPE_LEA, "Effective address") \ + ENUM_ENTRY(TYPE_M128, "16-byte (SSE/SSE2)") \ + ENUM_ENTRY(TYPE_M256, "256-byte (AVX)") \ + ENUM_ENTRY(TYPE_M1616, "2+2-byte segment+offset address") \ + ENUM_ENTRY(TYPE_M1632, "2+4-byte") \ + ENUM_ENTRY(TYPE_M1664, "2+8-byte") \ + ENUM_ENTRY(TYPE_M16_32, "2+4-byte two-part memory operand (LIDT, LGDT)") \ + ENUM_ENTRY(TYPE_M16_16, "2+2-byte (BOUND)") \ + ENUM_ENTRY(TYPE_M32_32, "4+4-byte (BOUND)") \ + ENUM_ENTRY(TYPE_M16_64, "2+8-byte (LIDT, LGDT)") \ + ENUM_ENTRY(TYPE_MOFFS8, "1-byte memory offset (relative to segment " \ + "base)") \ + ENUM_ENTRY(TYPE_MOFFS16, "2-byte") \ + ENUM_ENTRY(TYPE_MOFFS32, "4-byte") \ + ENUM_ENTRY(TYPE_MOFFS64, "8-byte") \ + ENUM_ENTRY(TYPE_SREG, "Byte with single bit set: 0 = ES, 1 = CS, " \ + "2 = SS, 3 = DS, 4 = FS, 5 
= GS") \ + ENUM_ENTRY(TYPE_M32FP, "32-bit IEE754 memory floating-point operand") \ + ENUM_ENTRY(TYPE_M64FP, "64-bit") \ + ENUM_ENTRY(TYPE_M80FP, "80-bit extended") \ + ENUM_ENTRY(TYPE_M16INT, "2-byte memory integer operand for use in " \ + "floating-point instructions") \ + ENUM_ENTRY(TYPE_M32INT, "4-byte") \ + ENUM_ENTRY(TYPE_M64INT, "8-byte") \ + ENUM_ENTRY(TYPE_ST, "Position on the floating-point stack") \ + ENUM_ENTRY(TYPE_MM, "MMX register operand") \ + ENUM_ENTRY(TYPE_MM32, "4-byte MMX register or memory operand") \ + ENUM_ENTRY(TYPE_MM64, "8-byte") \ + ENUM_ENTRY(TYPE_XMM, "XMM register operand") \ + ENUM_ENTRY(TYPE_XMM32, "4-byte XMM register or memory operand") \ + ENUM_ENTRY(TYPE_XMM64, "8-byte") \ + ENUM_ENTRY(TYPE_XMM128, "16-byte") \ + ENUM_ENTRY(TYPE_XMM256, "32-byte") \ + ENUM_ENTRY(TYPE_XMM0, "Implicit use of XMM0") \ + ENUM_ENTRY(TYPE_SEGMENTREG, "Segment register operand") \ + ENUM_ENTRY(TYPE_DEBUGREG, "Debug register operand") \ + ENUM_ENTRY(TYPE_CONTROLREG, "Control register operand") \ + \ + ENUM_ENTRY(TYPE_Mv, "Memory operand of operand size") \ + ENUM_ENTRY(TYPE_Rv, "Register operand of operand size") \ + ENUM_ENTRY(TYPE_IMMv, "Immediate operand of operand size") \ + ENUM_ENTRY(TYPE_RELv, "Immediate address of operand size") \ + ENUM_ENTRY(TYPE_DUP0, "Duplicate of operand 0") \ + ENUM_ENTRY(TYPE_DUP1, "operand 1") \ + ENUM_ENTRY(TYPE_DUP2, "operand 2") \ + ENUM_ENTRY(TYPE_DUP3, "operand 3") \ + ENUM_ENTRY(TYPE_DUP4, "operand 4") \ + ENUM_ENTRY(TYPE_M512, "512-bit FPU/MMX/XMM/MXCSR state") + +#define ENUM_ENTRY(n, d) n, +typedef enum { + TYPES + TYPE_max +} OperandType; +#undef ENUM_ENTRY + +/* + * OperandSpecifier - The specification for how to extract and interpret one + * operand. + */ +struct OperandSpecifier { + OperandEncoding encoding; + OperandType type; +}; + +/* + * Indicates where the opcode modifier (if any) is to be found. Extended + * opcodes with AddRegFrm have the opcode modifier in the ModR/M byte. + */ + +#define MODIFIER_TYPES \ + ENUM_ENTRY(MODIFIER_NONE) \ + ENUM_ENTRY(MODIFIER_OPCODE) \ + ENUM_ENTRY(MODIFIER_MODRM) + +#define ENUM_ENTRY(n) n, +typedef enum { + MODIFIER_TYPES + MODIFIER_max +} ModifierType; +#undef ENUM_ENTRY + +#define X86_MAX_OPERANDS 5 + +/* + * The specification for how to extract and interpret a full instruction and + * its operands. + */ +struct InstructionSpecifier { + ModifierType modifierType; + uint8_t modifierBase; + struct OperandSpecifier operands[X86_MAX_OPERANDS]; + + /* The macro below must be defined wherever this file is included. */ + INSTRUCTION_SPECIFIER_FIELDS +}; + +/* + * Decoding mode for the Intel disassembler. 16-bit, 32-bit, and 64-bit mode + * are supported, and represent real mode, IA-32e, and IA-32e in 64-bit mode, + * respectively. + */ +typedef enum { + MODE_16BIT, + MODE_32BIT, + MODE_64BIT +} DisassemblerMode; + +#endif diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp new file mode 100644 index 0000000..029d491 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp @@ -0,0 +1,144 @@ +//===-- X86ATTInstPrinter.cpp - AT&T assembly instruction printing --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file includes code for rendering MCInst instances as AT&T-style +// assembly. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "asm-printer" +#include "X86ATTInstPrinter.h" +#include "X86InstComments.h" +#include "MCTargetDesc/X86MCTargetDesc.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/FormattedStream.h" +#include <map> +using namespace llvm; + +// Include the auto-generated portion of the assembly writer. +#define GET_INSTRUCTION_NAME +#define PRINT_ALIAS_INSTR +#include "X86GenAsmWriter.inc" + +X86ATTInstPrinter::X86ATTInstPrinter(const MCAsmInfo &MAI) + : MCInstPrinter(MAI) { +} + +void X86ATTInstPrinter::printRegName(raw_ostream &OS, + unsigned RegNo) const { + OS << '%' << getRegisterName(RegNo); +} + +void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, + StringRef Annot) { + // Try to print any aliases first. + if (!printAliasInstr(MI, OS)) + printInstruction(MI, OS); + + // If verbose assembly is enabled, we can print some informative comments. + if (CommentStream) { + printAnnotation(OS, Annot); + EmitAnyX86InstComments(MI, *CommentStream, getRegisterName); + } +} + +StringRef X86ATTInstPrinter::getOpcodeName(unsigned Opcode) const { + return getInstructionName(Opcode); +} + +void X86ATTInstPrinter::printSSECC(const MCInst *MI, unsigned Op, + raw_ostream &O) { + switch (MI->getOperand(Op).getImm()) { + default: assert(0 && "Invalid ssecc argument!"); + case 0: O << "eq"; break; + case 1: O << "lt"; break; + case 2: O << "le"; break; + case 3: O << "unord"; break; + case 4: O << "neq"; break; + case 5: O << "nlt"; break; + case 6: O << "nle"; break; + case 7: O << "ord"; break; + } +} + +/// print_pcrel_imm - This is used to print an immediate value that ends up +/// being encoded as a pc-relative value (e.g. for jumps and calls). These +/// print slightly differently than normal immediates. For example, a $ is not +/// emitted. +void X86ATTInstPrinter::print_pcrel_imm(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isImm()) + // Print this as a signed 32-bit value. + O << (int)Op.getImm(); + else { + assert(Op.isExpr() && "unknown pcrel immediate operand"); + O << *Op.getExpr(); + } +} + +void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isReg()) { + O << '%' << getRegisterName(Op.getReg()); + } else if (Op.isImm()) { + // Print X86 immediates as signed values. + O << '$' << (int64_t)Op.getImm(); + + if (CommentStream && (Op.getImm() > 255 || Op.getImm() < -256)) + *CommentStream << format("imm = 0x%llX\n", (long long)Op.getImm()); + + } else { + assert(Op.isExpr() && "unknown operand kind in printOperand"); + O << '$' << *Op.getExpr(); + } +} + +void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op, + raw_ostream &O) { + const MCOperand &BaseReg = MI->getOperand(Op); + const MCOperand &IndexReg = MI->getOperand(Op+2); + const MCOperand &DispSpec = MI->getOperand(Op+3); + const MCOperand &SegReg = MI->getOperand(Op+4); + + // If this has a segment register, print it. 
+ if (SegReg.getReg()) { + printOperand(MI, Op+4, O); + O << ':'; + } + + if (DispSpec.isImm()) { + int64_t DispVal = DispSpec.getImm(); + if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) + O << DispVal; + } else { + assert(DispSpec.isExpr() && "non-immediate displacement for LEA?"); + O << *DispSpec.getExpr(); + } + + if (IndexReg.getReg() || BaseReg.getReg()) { + O << '('; + if (BaseReg.getReg()) + printOperand(MI, Op, O); + + if (IndexReg.getReg()) { + O << ','; + printOperand(MI, Op+2, O); + unsigned ScaleVal = MI->getOperand(Op+1).getImm(); + if (ScaleVal != 1) + O << ',' << ScaleVal; + } + O << ')'; + } +} diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h new file mode 100644 index 0000000..0293869 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h @@ -0,0 +1,86 @@ +//===-- X86ATTInstPrinter.h - Convert X86 MCInst to assembly syntax -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This class prints an X86 MCInst to AT&T style .s file syntax. +// +//===----------------------------------------------------------------------===// + +#ifndef X86_ATT_INST_PRINTER_H +#define X86_ATT_INST_PRINTER_H + +#include "llvm/MC/MCInstPrinter.h" + +namespace llvm { + +class MCOperand; + +class X86ATTInstPrinter : public MCInstPrinter { +public: + X86ATTInstPrinter(const MCAsmInfo &MAI); + + virtual void printRegName(raw_ostream &OS, unsigned RegNo) const; + virtual void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot); + virtual StringRef getOpcodeName(unsigned Opcode) const; + + // Autogenerated by tblgen, returns true if we successfully printed an + // alias. + bool printAliasInstr(const MCInst *MI, raw_ostream &OS); + + // Autogenerated by tblgen. 
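The printMemReference implementation above renders the five-part memory reference (base, scale, index, displacement, segment) in AT&T syntax as segment:disp(base,index,scale), dropping whatever pieces are absent. A minimal standalone sketch of the same formatting rule follows; formatATTMem and its plain string/integer parameters are hypothetical stand-ins for the MCOperand-based code, not part of the patch.

    #include <cstdint>
    #include <sstream>
    #include <string>

    // Hypothetical helper: empty strings mean "no register"; a zero
    // displacement is printed only when there is neither a base nor an index.
    std::string formatATTMem(const std::string &Seg, int64_t Disp,
                             const std::string &Base, const std::string &Index,
                             unsigned Scale) {
      std::ostringstream OS;
      if (!Seg.empty())
        OS << '%' << Seg << ':';
      if (Disp != 0 || (Base.empty() && Index.empty()))
        OS << Disp;
      if (!Base.empty() || !Index.empty()) {
        OS << '(';
        if (!Base.empty())
          OS << '%' << Base;
        if (!Index.empty()) {
          OS << ",%" << Index;
          if (Scale != 1)
            OS << ',' << Scale;
        }
        OS << ')';
      }
      return OS.str();
    }

    // formatATTMem("", -16, "rbp", "rax", 4) yields "-16(%rbp,%rax,4)".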
+ void printInstruction(const MCInst *MI, raw_ostream &OS); + static const char *getRegisterName(unsigned RegNo); + static const char *getInstructionName(unsigned Opcode); + + void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &OS); + void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &OS); + void printSSECC(const MCInst *MI, unsigned Op, raw_ostream &OS); + void print_pcrel_imm(const MCInst *MI, unsigned OpNo, raw_ostream &OS); + + void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + + void printi8mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + void printi16mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + void printi32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + void printi64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + void printi128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + void printi256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + void printf32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + void printf64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + void printf80mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + void printf128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + void printf256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } +}; + +} + +#endif diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp new file mode 100644 index 0000000..8d85b95 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp @@ -0,0 +1,287 @@ +//===-- X86InstComments.cpp - Generate verbose-asm comments for instrs ----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This defines functionality used to emit comments about X86 instructions to +// an output stream for -fverbose-asm. +// +//===----------------------------------------------------------------------===// + +#include "X86InstComments.h" +#include "MCTargetDesc/X86MCTargetDesc.h" +#include "Utils/X86ShuffleDecode.h" +#include "llvm/MC/MCInst.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +//===----------------------------------------------------------------------===// +// Top Level Entrypoint +//===----------------------------------------------------------------------===// + +/// EmitAnyX86InstComments - This function decodes x86 instructions and prints +/// newline terminated strings to the specified string if desired. This +/// information is shown in disassembly dumps when verbose assembly is enabled. +void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, + const char *(*getRegName)(unsigned)) { + // If this is a shuffle operation, the switch should fill in this state. 
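For reference on what the switch below stores in ShuffleMask: for PSHUFD, each 2-bit field of the 8-bit immediate selects which source doubleword supplies the corresponding destination element, which is presumably what DecodePSHUFMask expands into per-element indices. A small sketch of that expansion under those assumed semantics; decodePSHUFDImm is an illustrative name, not the LLVM helper itself.

    #include <cstdint>
    #include <vector>

    // Destination dword i takes source dword (Imm >> (2*i)) & 3.
    std::vector<unsigned> decodePSHUFDImm(uint8_t Imm) {
      std::vector<unsigned> Mask;
      for (unsigned i = 0; i != 4; ++i)
        Mask.push_back((Imm >> (2 * i)) & 0x3);
      return Mask;
    }

    // decodePSHUFDImm(0x1B) yields {3, 2, 1, 0}, i.e. the dwords reversed.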
+ SmallVector<unsigned, 8> ShuffleMask; + const char *DestName = 0, *Src1Name = 0, *Src2Name = 0; + + switch (MI->getOpcode()) { + case X86::INSERTPSrr: + Src1Name = getRegName(MI->getOperand(1).getReg()); + Src2Name = getRegName(MI->getOperand(2).getReg()); + DecodeINSERTPSMask(MI->getOperand(3).getImm(), ShuffleMask); + break; + + case X86::MOVLHPSrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + Src1Name = getRegName(MI->getOperand(0).getReg()); + DecodeMOVLHPSMask(2, ShuffleMask); + break; + + case X86::MOVHLPSrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + Src1Name = getRegName(MI->getOperand(0).getReg()); + DecodeMOVHLPSMask(2, ShuffleMask); + break; + + case X86::PSHUFDri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::PSHUFDmi: + DestName = getRegName(MI->getOperand(0).getReg()); + DecodePSHUFMask(4, MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + break; + + case X86::PSHUFHWri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::PSHUFHWmi: + DestName = getRegName(MI->getOperand(0).getReg()); + DecodePSHUFHWMask(MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + break; + case X86::PSHUFLWri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::PSHUFLWmi: + DestName = getRegName(MI->getOperand(0).getReg()); + DecodePSHUFLWMask(MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + break; + + case X86::PUNPCKHBWrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::PUNPCKHBWrm: + Src1Name = getRegName(MI->getOperand(0).getReg()); + DecodePUNPCKHMask(16, ShuffleMask); + break; + case X86::PUNPCKHWDrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::PUNPCKHWDrm: + Src1Name = getRegName(MI->getOperand(0).getReg()); + DecodePUNPCKHMask(8, ShuffleMask); + break; + case X86::PUNPCKHDQrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::PUNPCKHDQrm: + Src1Name = getRegName(MI->getOperand(0).getReg()); + DecodePUNPCKHMask(4, ShuffleMask); + break; + case X86::PUNPCKHQDQrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::PUNPCKHQDQrm: + Src1Name = getRegName(MI->getOperand(0).getReg()); + DecodePUNPCKHMask(2, ShuffleMask); + break; + + case X86::PUNPCKLBWrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::PUNPCKLBWrm: + Src1Name = getRegName(MI->getOperand(0).getReg()); + DecodePUNPCKLBWMask(16, ShuffleMask); + break; + case X86::PUNPCKLWDrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::PUNPCKLWDrm: + Src1Name = getRegName(MI->getOperand(0).getReg()); + DecodePUNPCKLWDMask(8, ShuffleMask); + break; + case X86::PUNPCKLDQrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::PUNPCKLDQrm: + Src1Name = getRegName(MI->getOperand(0).getReg()); + DecodePUNPCKLDQMask(4, ShuffleMask); + break; + case X86::PUNPCKLQDQrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::PUNPCKLQDQrm: + Src1Name = getRegName(MI->getOperand(0).getReg()); + DecodePUNPCKLQDQMask(2, ShuffleMask); + break; + + case X86::SHUFPDrri: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. 
+ case X86::SHUFPDrmi: + DecodeSHUFPSMask(2, MI->getOperand(3).getImm(), ShuffleMask); + Src1Name = getRegName(MI->getOperand(0).getReg()); + break; + + case X86::SHUFPSrri: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::SHUFPSrmi: + DecodeSHUFPSMask(4, MI->getOperand(3).getImm(), ShuffleMask); + Src1Name = getRegName(MI->getOperand(0).getReg()); + break; + + case X86::UNPCKLPDrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::UNPCKLPDrm: + DecodeUNPCKLPDMask(2, ShuffleMask); + Src1Name = getRegName(MI->getOperand(0).getReg()); + break; + case X86::VUNPCKLPDrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VUNPCKLPDrm: + DecodeUNPCKLPDMask(2, ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + break; + case X86::VUNPCKLPDYrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VUNPCKLPDYrm: + DecodeUNPCKLPDMask(4, ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + break; + case X86::UNPCKLPSrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::UNPCKLPSrm: + DecodeUNPCKLPSMask(4, ShuffleMask); + Src1Name = getRegName(MI->getOperand(0).getReg()); + break; + case X86::VUNPCKLPSrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VUNPCKLPSrm: + DecodeUNPCKLPSMask(4, ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + break; + case X86::VUNPCKLPSYrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VUNPCKLPSYrm: + DecodeUNPCKLPSMask(8, ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + break; + case X86::UNPCKHPDrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::UNPCKHPDrm: + DecodeUNPCKHPMask(2, ShuffleMask); + Src1Name = getRegName(MI->getOperand(0).getReg()); + break; + case X86::UNPCKHPSrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::UNPCKHPSrm: + DecodeUNPCKHPMask(4, ShuffleMask); + Src1Name = getRegName(MI->getOperand(0).getReg()); + break; + case X86::VPERMILPSri: + DecodeVPERMILPSMask(4, MI->getOperand(2).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(0).getReg()); + break; + case X86::VPERMILPSYri: + DecodeVPERMILPSMask(8, MI->getOperand(2).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(0).getReg()); + break; + case X86::VPERMILPDri: + DecodeVPERMILPDMask(2, MI->getOperand(2).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(0).getReg()); + break; + case X86::VPERMILPDYri: + DecodeVPERMILPDMask(4, MI->getOperand(2).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(0).getReg()); + break; + case X86::VPERM2F128rr: + DecodeVPERM2F128Mask(MI->getOperand(3).getImm(), ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + Src2Name = getRegName(MI->getOperand(2).getReg()); + break; + } + + + // If this was a shuffle operation, print the shuffle mask. + if (!ShuffleMask.empty()) { + if (DestName == 0) DestName = Src1Name; + OS << (DestName ? DestName : "mem") << " = "; + + // If the two sources are the same, canonicalize the input elements to be + // from the first src so that we get larger element spans. + if (Src1Name == Src2Name) { + for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) { + if ((int)ShuffleMask[i] >= 0 && // Not sentinel. + ShuffleMask[i] >= e) // From second mask. 
+ ShuffleMask[i] -= e; + } + } + + // The shuffle mask specifies which elements of the src1/src2 fill in the + // destination, with a few sentinel values. Loop through and print them + // out. + for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) { + if (i != 0) + OS << ','; + if (ShuffleMask[i] == SM_SentinelZero) { + OS << "zero"; + continue; + } + + // Otherwise, it must come from src1 or src2. Print the span of elements + // that comes from this src. + bool isSrc1 = ShuffleMask[i] < ShuffleMask.size(); + const char *SrcName = isSrc1 ? Src1Name : Src2Name; + OS << (SrcName ? SrcName : "mem") << '['; + bool IsFirst = true; + while (i != e && + (int)ShuffleMask[i] >= 0 && + (ShuffleMask[i] < ShuffleMask.size()) == isSrc1) { + if (!IsFirst) + OS << ','; + else + IsFirst = false; + OS << ShuffleMask[i] % ShuffleMask.size(); + ++i; + } + OS << ']'; + --i; // For loop increments element #. + } + //MI->print(OS, 0); + OS << "\n"; + } + +} diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h new file mode 100644 index 0000000..6b86db4 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h @@ -0,0 +1,25 @@ +//===-- X86InstComments.h - Generate verbose-asm comments for instrs ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This defines functionality used to emit comments about X86 instructions to +// an output stream for -fverbose-asm. +// +//===----------------------------------------------------------------------===// + +#ifndef X86_INST_COMMENTS_H +#define X86_INST_COMMENTS_H + +namespace llvm { + class MCInst; + class raw_ostream; + void EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, + const char *(*getRegName)(unsigned)); +} + +#endif diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp new file mode 100644 index 0000000..f9ab5ae --- /dev/null +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp @@ -0,0 +1,146 @@ +//===-- X86IntelInstPrinter.cpp - AT&T assembly instruction printing ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file includes code for rendering MCInst instances as AT&T-style +// assembly. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "asm-printer" +#include "X86IntelInstPrinter.h" +#include "X86InstComments.h" +#include "MCTargetDesc/X86MCTargetDesc.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FormattedStream.h" +#include <cctype> +using namespace llvm; + +// Include the auto-generated portion of the assembly writer. 
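The comment-printing loop shown above in X86InstComments.cpp groups consecutive mask entries that come from the same source into spans such as xmm0[0,1],xmm1[2,3] and prints "zero" for sentinel entries. Below is a compact standalone re-creation of that grouping, assuming a mask in which a negative value marks a zeroed element and indices of NumElts or more select the second source; printShuffleComment and the plain std::vector interface are illustrative only.

    #include <cstdio>
    #include <vector>

    void printShuffleComment(const std::vector<int> &Mask,
                             const char *Src1, const char *Src2) {
      const int NumElts = static_cast<int>(Mask.size());
      for (int i = 0; i != NumElts; ++i) {
        if (i != 0)
          std::printf(",");
        if (Mask[i] < 0) {           // sentinel: element is zeroed
          std::printf("zero");
          continue;
        }
        bool FromSrc1 = Mask[i] < NumElts;
        std::printf("%s[", FromSrc1 ? Src1 : Src2);
        bool First = true;
        // Extend the span while elements keep coming from the same source.
        while (i != NumElts && Mask[i] >= 0 &&
               (Mask[i] < NumElts) == FromSrc1) {
          std::printf(First ? "%d" : ",%d", Mask[i] % NumElts);
          First = false;
          ++i;
        }
        std::printf("]");
        --i;                         // the for loop increments i again
      }
      std::printf("\n");
    }

    // printShuffleComment({0, 1, 6, 7}, "xmm0", "xmm1")
    //   prints "xmm0[0,1],xmm1[2,3]".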
+#define GET_INSTRUCTION_NAME +#include "X86GenAsmWriter1.inc" + +void X86IntelInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { + OS << getRegisterName(RegNo); +} + +void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, + StringRef Annot) { + printInstruction(MI, OS); + + // If verbose assembly is enabled, we can print some informative comments. + if (CommentStream) { + printAnnotation(OS, Annot); + EmitAnyX86InstComments(MI, *CommentStream, getRegisterName); + } +} +StringRef X86IntelInstPrinter::getOpcodeName(unsigned Opcode) const { + return getInstructionName(Opcode); +} + +void X86IntelInstPrinter::printSSECC(const MCInst *MI, unsigned Op, + raw_ostream &O) { + switch (MI->getOperand(Op).getImm()) { + default: assert(0 && "Invalid ssecc argument!"); + case 0: O << "eq"; break; + case 1: O << "lt"; break; + case 2: O << "le"; break; + case 3: O << "unord"; break; + case 4: O << "neq"; break; + case 5: O << "nlt"; break; + case 6: O << "nle"; break; + case 7: O << "ord"; break; + } +} + +/// print_pcrel_imm - This is used to print an immediate value that ends up +/// being encoded as a pc-relative value. +void X86IntelInstPrinter::print_pcrel_imm(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isImm()) + O << Op.getImm(); + else { + assert(Op.isExpr() && "unknown pcrel immediate operand"); + O << *Op.getExpr(); + } +} + +static void PrintRegName(raw_ostream &O, StringRef RegName) { + for (unsigned i = 0, e = RegName.size(); i != e; ++i) + O << (char)toupper(RegName[i]); +} + +void X86IntelInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isReg()) { + PrintRegName(O, getRegisterName(Op.getReg())); + } else if (Op.isImm()) { + O << Op.getImm(); + } else { + assert(Op.isExpr() && "unknown operand kind in printOperand"); + O << *Op.getExpr(); + } +} + +void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op, + raw_ostream &O) { + const MCOperand &BaseReg = MI->getOperand(Op); + unsigned ScaleVal = MI->getOperand(Op+1).getImm(); + const MCOperand &IndexReg = MI->getOperand(Op+2); + const MCOperand &DispSpec = MI->getOperand(Op+3); + const MCOperand &SegReg = MI->getOperand(Op+4); + + // If this has a segment register, print it. 
+ if (SegReg.getReg()) { + printOperand(MI, Op+4, O); + O << ':'; + } + + O << '['; + + bool NeedPlus = false; + if (BaseReg.getReg()) { + printOperand(MI, Op, O); + NeedPlus = true; + } + + if (IndexReg.getReg()) { + if (NeedPlus) O << " + "; + if (ScaleVal != 1) + O << ScaleVal << '*'; + printOperand(MI, Op+2, O); + NeedPlus = true; + } + + + if (!DispSpec.isImm()) { + if (NeedPlus) O << " + "; + assert(DispSpec.isExpr() && "non-immediate displacement for LEA?"); + O << *DispSpec.getExpr(); + } else { + int64_t DispVal = DispSpec.getImm(); + if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) { + if (NeedPlus) { + if (DispVal > 0) + O << " + "; + else { + O << " - "; + DispVal = -DispVal; + } + } + O << DispVal; + } + } + + O << ']'; +} diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h new file mode 100644 index 0000000..6d5ec62 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h @@ -0,0 +1,96 @@ +//===-- X86IntelInstPrinter.h - Convert X86 MCInst to assembly syntax -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This class prints an X86 MCInst to intel style .s file syntax. +// +//===----------------------------------------------------------------------===// + +#ifndef X86_INTEL_INST_PRINTER_H +#define X86_INTEL_INST_PRINTER_H + +#include "llvm/MC/MCInstPrinter.h" +#include "llvm/Support/raw_ostream.h" + +namespace llvm { + +class MCOperand; + +class X86IntelInstPrinter : public MCInstPrinter { +public: + X86IntelInstPrinter(const MCAsmInfo &MAI) + : MCInstPrinter(MAI) {} + + virtual void printRegName(raw_ostream &OS, unsigned RegNo) const; + virtual void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot); + virtual StringRef getOpcodeName(unsigned Opcode) const; + + // Autogenerated by tblgen. 
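One detail of the Intel-syntax printMemReference above is how the displacement's sign is folded into the separator once a base or index has been printed, so an operand reads [RBP + 4*RAX - 16] rather than [RBP + 4*RAX + -16]. A small sketch of just that rule; appendIntelDisp and its plain-value parameters are hypothetical, not part of the patch.

    #include <cstdint>
    #include <sstream>

    // Append the displacement to an Intel-style memory operand. NeedPlus is
    // true when a base register or scaled index was already printed.
    void appendIntelDisp(std::ostringstream &OS, int64_t Disp, bool NeedPlus) {
      if (Disp == 0 && NeedPlus)
        return;                      // nothing to add, e.g. just "[RAX]"
      if (NeedPlus) {
        if (Disp > 0)
          OS << " + ";
        else {
          OS << " - ";
          Disp = -Disp;
        }
      }
      OS << Disp;
    }

    // With NeedPlus == true and Disp == -16 this appends " - 16".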
+ void printInstruction(const MCInst *MI, raw_ostream &O); + static const char *getRegisterName(unsigned RegNo); + static const char *getInstructionName(unsigned Opcode); + + void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &O); + void printSSECC(const MCInst *MI, unsigned Op, raw_ostream &O); + void print_pcrel_imm(const MCInst *MI, unsigned OpNo, raw_ostream &O); + + void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "OPAQUE PTR "; + printMemReference(MI, OpNo, O); + } + + void printi8mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "BYTE PTR "; + printMemReference(MI, OpNo, O); + } + void printi16mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "WORD PTR "; + printMemReference(MI, OpNo, O); + } + void printi32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "DWORD PTR "; + printMemReference(MI, OpNo, O); + } + void printi64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "QWORD PTR "; + printMemReference(MI, OpNo, O); + } + void printi128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "XMMWORD PTR "; + printMemReference(MI, OpNo, O); + } + void printi256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "YMMWORD PTR "; + printMemReference(MI, OpNo, O); + } + void printf32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "DWORD PTR "; + printMemReference(MI, OpNo, O); + } + void printf64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "QWORD PTR "; + printMemReference(MI, OpNo, O); + } + void printf80mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "XWORD PTR "; + printMemReference(MI, OpNo, O); + } + void printf128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "XMMWORD PTR "; + printMemReference(MI, OpNo, O); + } + void printf256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + O << "YMMWORD PTR "; + printMemReference(MI, OpNo, O); + } +}; + +} + +#endif diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp new file mode 100644 index 0000000..69ad7d7 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -0,0 +1,458 @@ +//===-- X86AsmBackend.cpp - X86 Assembler Backend -------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/MC/MCAsmBackend.h" +#include "MCTargetDesc/X86BaseInfo.h" +#include "MCTargetDesc/X86FixupKinds.h" +#include "llvm/ADT/Twine.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCELFObjectWriter.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCFixupKindInfo.h" +#include "llvm/MC/MCMachObjectWriter.h" +#include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCSectionCOFF.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCSectionMachO.h" +#include "llvm/Object/MachOFormat.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ELF.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +// Option to allow disabling arithmetic relaxation to workaround PR9807, which +// is useful when running bitwise comparison experiments on Darwin. 
We should be +// able to remove this once PR9807 is resolved. +static cl::opt<bool> +MCDisableArithRelaxation("mc-x86-disable-arith-relaxation", + cl::desc("Disable relaxation of arithmetic instruction for X86")); + +static unsigned getFixupKindLog2Size(unsigned Kind) { + switch (Kind) { + default: assert(0 && "invalid fixup kind!"); + case FK_PCRel_1: + case FK_Data_1: return 0; + case FK_PCRel_2: + case FK_Data_2: return 1; + case FK_PCRel_4: + case X86::reloc_riprel_4byte: + case X86::reloc_riprel_4byte_movq_load: + case X86::reloc_signed_4byte: + case X86::reloc_global_offset_table: + case FK_Data_4: return 2; + case FK_PCRel_8: + case FK_Data_8: return 3; + } +} + +namespace { + +class X86ELFObjectWriter : public MCELFObjectTargetWriter { +public: + X86ELFObjectWriter(bool is64Bit, Triple::OSType OSType, uint16_t EMachine, + bool HasRelocationAddend) + : MCELFObjectTargetWriter(is64Bit, OSType, EMachine, HasRelocationAddend) {} +}; + +class X86AsmBackend : public MCAsmBackend { +public: + X86AsmBackend(const Target &T) + : MCAsmBackend() {} + + unsigned getNumFixupKinds() const { + return X86::NumTargetFixupKinds; + } + + const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const { + const static MCFixupKindInfo Infos[X86::NumTargetFixupKinds] = { + { "reloc_riprel_4byte", 0, 4 * 8, MCFixupKindInfo::FKF_IsPCRel }, + { "reloc_riprel_4byte_movq_load", 0, 4 * 8, MCFixupKindInfo::FKF_IsPCRel}, + { "reloc_signed_4byte", 0, 4 * 8, 0}, + { "reloc_global_offset_table", 0, 4 * 8, 0} + }; + + if (Kind < FirstTargetFixupKind) + return MCAsmBackend::getFixupKindInfo(Kind); + + assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() && + "Invalid kind!"); + return Infos[Kind - FirstTargetFixupKind]; + } + + void ApplyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, + uint64_t Value) const { + unsigned Size = 1 << getFixupKindLog2Size(Fixup.getKind()); + + assert(Fixup.getOffset() + Size <= DataSize && + "Invalid fixup offset!"); + + // Check that uppper bits are either all zeros or all ones. + // Specifically ignore overflow/underflow as long as the leakage is + // limited to the lower bits. This is to remain compatible with + // other assemblers. 
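The assert that follows expresses the comment above via isIntN(Size * 8 + 1, Value): a fixup value is acceptable when every bit above the low Size*8 bits is a copy of the same bit, so the field reads back correctly whether it is later interpreted as signed or unsigned. A standalone sketch of that acceptance test; fitsInFixup is an illustrative name, not an LLVM API.

    #include <cstdint>

    // True when Value can be stored in SizeInBytes bytes such that the bits
    // above the field are either all zeros or all ones (a sign extension).
    bool fitsInFixup(int64_t Value, unsigned SizeInBytes) {
      if (SizeInBytes >= 8)
        return true;
      int64_t Upper = Value >> (SizeInBytes * 8);  // arithmetic shift assumed
      return Upper == 0 || Upper == -1;
    }

    // fitsInFixup(0xFFFFFFFFLL, 4)  is true  (a valid unsigned 32-bit pattern).
    // fitsInFixup(0x100000000LL, 4) is false (needs more than 32 bits).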
+ assert(isIntN(Size * 8 + 1, Value) && + "Value does not fit in the Fixup field"); + + for (unsigned i = 0; i != Size; ++i) + Data[Fixup.getOffset() + i] = uint8_t(Value >> (i * 8)); + } + + bool MayNeedRelaxation(const MCInst &Inst) const; + + void RelaxInstruction(const MCInst &Inst, MCInst &Res) const; + + bool WriteNopData(uint64_t Count, MCObjectWriter *OW) const; +}; +} // end anonymous namespace + +static unsigned getRelaxedOpcodeBranch(unsigned Op) { + switch (Op) { + default: + return Op; + + case X86::JAE_1: return X86::JAE_4; + case X86::JA_1: return X86::JA_4; + case X86::JBE_1: return X86::JBE_4; + case X86::JB_1: return X86::JB_4; + case X86::JE_1: return X86::JE_4; + case X86::JGE_1: return X86::JGE_4; + case X86::JG_1: return X86::JG_4; + case X86::JLE_1: return X86::JLE_4; + case X86::JL_1: return X86::JL_4; + case X86::JMP_1: return X86::JMP_4; + case X86::JNE_1: return X86::JNE_4; + case X86::JNO_1: return X86::JNO_4; + case X86::JNP_1: return X86::JNP_4; + case X86::JNS_1: return X86::JNS_4; + case X86::JO_1: return X86::JO_4; + case X86::JP_1: return X86::JP_4; + case X86::JS_1: return X86::JS_4; + } +} + +static unsigned getRelaxedOpcodeArith(unsigned Op) { + switch (Op) { + default: + return Op; + + // IMUL + case X86::IMUL16rri8: return X86::IMUL16rri; + case X86::IMUL16rmi8: return X86::IMUL16rmi; + case X86::IMUL32rri8: return X86::IMUL32rri; + case X86::IMUL32rmi8: return X86::IMUL32rmi; + case X86::IMUL64rri8: return X86::IMUL64rri32; + case X86::IMUL64rmi8: return X86::IMUL64rmi32; + + // AND + case X86::AND16ri8: return X86::AND16ri; + case X86::AND16mi8: return X86::AND16mi; + case X86::AND32ri8: return X86::AND32ri; + case X86::AND32mi8: return X86::AND32mi; + case X86::AND64ri8: return X86::AND64ri32; + case X86::AND64mi8: return X86::AND64mi32; + + // OR + case X86::OR16ri8: return X86::OR16ri; + case X86::OR16mi8: return X86::OR16mi; + case X86::OR32ri8: return X86::OR32ri; + case X86::OR32mi8: return X86::OR32mi; + case X86::OR64ri8: return X86::OR64ri32; + case X86::OR64mi8: return X86::OR64mi32; + + // XOR + case X86::XOR16ri8: return X86::XOR16ri; + case X86::XOR16mi8: return X86::XOR16mi; + case X86::XOR32ri8: return X86::XOR32ri; + case X86::XOR32mi8: return X86::XOR32mi; + case X86::XOR64ri8: return X86::XOR64ri32; + case X86::XOR64mi8: return X86::XOR64mi32; + + // ADD + case X86::ADD16ri8: return X86::ADD16ri; + case X86::ADD16mi8: return X86::ADD16mi; + case X86::ADD32ri8: return X86::ADD32ri; + case X86::ADD32mi8: return X86::ADD32mi; + case X86::ADD64ri8: return X86::ADD64ri32; + case X86::ADD64mi8: return X86::ADD64mi32; + + // SUB + case X86::SUB16ri8: return X86::SUB16ri; + case X86::SUB16mi8: return X86::SUB16mi; + case X86::SUB32ri8: return X86::SUB32ri; + case X86::SUB32mi8: return X86::SUB32mi; + case X86::SUB64ri8: return X86::SUB64ri32; + case X86::SUB64mi8: return X86::SUB64mi32; + + // CMP + case X86::CMP16ri8: return X86::CMP16ri; + case X86::CMP16mi8: return X86::CMP16mi; + case X86::CMP32ri8: return X86::CMP32ri; + case X86::CMP32mi8: return X86::CMP32mi; + case X86::CMP64ri8: return X86::CMP64ri32; + case X86::CMP64mi8: return X86::CMP64mi32; + + // PUSH + case X86::PUSHi8: return X86::PUSHi32; + case X86::PUSHi16: return X86::PUSHi32; + case X86::PUSH64i8: return X86::PUSH64i32; + case X86::PUSH64i16: return X86::PUSH64i32; + } +} + +static unsigned getRelaxedOpcode(unsigned Op) { + unsigned R = getRelaxedOpcodeArith(Op); + if (R != Op) + return R; + return getRelaxedOpcodeBranch(Op); +} + +bool 
X86AsmBackend::MayNeedRelaxation(const MCInst &Inst) const { + // Branches can always be relaxed. + if (getRelaxedOpcodeBranch(Inst.getOpcode()) != Inst.getOpcode()) + return true; + + if (MCDisableArithRelaxation) + return false; + + // Check if this instruction is ever relaxable. + if (getRelaxedOpcodeArith(Inst.getOpcode()) == Inst.getOpcode()) + return false; + + + // Check if it has an expression and is not RIP relative. + bool hasExp = false; + bool hasRIP = false; + for (unsigned i = 0; i < Inst.getNumOperands(); ++i) { + const MCOperand &Op = Inst.getOperand(i); + if (Op.isExpr()) + hasExp = true; + + if (Op.isReg() && Op.getReg() == X86::RIP) + hasRIP = true; + } + + // FIXME: Why exactly do we need the !hasRIP? Is it just a limitation on + // how we do relaxations? + return hasExp && !hasRIP; +} + +// FIXME: Can tblgen help at all here to verify there aren't other instructions +// we can relax? +void X86AsmBackend::RelaxInstruction(const MCInst &Inst, MCInst &Res) const { + // The only relaxations X86 does is from a 1byte pcrel to a 4byte pcrel. + unsigned RelaxedOp = getRelaxedOpcode(Inst.getOpcode()); + + if (RelaxedOp == Inst.getOpcode()) { + SmallString<256> Tmp; + raw_svector_ostream OS(Tmp); + Inst.dump_pretty(OS); + OS << "\n"; + report_fatal_error("unexpected instruction to relax: " + OS.str()); + } + + Res = Inst; + Res.setOpcode(RelaxedOp); +} + +/// WriteNopData - Write optimal nops to the output file for the \arg Count +/// bytes. This returns the number of bytes written. It may return 0 if +/// the \arg Count is more than the maximum optimal nops. +bool X86AsmBackend::WriteNopData(uint64_t Count, MCObjectWriter *OW) const { + static const uint8_t Nops[10][10] = { + // nop + {0x90}, + // xchg %ax,%ax + {0x66, 0x90}, + // nopl (%[re]ax) + {0x0f, 0x1f, 0x00}, + // nopl 0(%[re]ax) + {0x0f, 0x1f, 0x40, 0x00}, + // nopl 0(%[re]ax,%[re]ax,1) + {0x0f, 0x1f, 0x44, 0x00, 0x00}, + // nopw 0(%[re]ax,%[re]ax,1) + {0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00}, + // nopl 0L(%[re]ax) + {0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00}, + // nopl 0L(%[re]ax,%[re]ax,1) + {0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}, + // nopw 0L(%[re]ax,%[re]ax,1) + {0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}, + // nopw %cs:0L(%[re]ax,%[re]ax,1) + {0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}, + }; + + // Write an optimal sequence for the first 15 bytes. + const uint64_t OptimalCount = (Count < 16) ? Count : 15; + const uint64_t Prefixes = OptimalCount <= 10 ? 0 : OptimalCount - 10; + for (uint64_t i = 0, e = Prefixes; i != e; i++) + OW->Write8(0x66); + const uint64_t Rest = OptimalCount - Prefixes; + for (uint64_t i = 0, e = Rest; i != e; i++) + OW->Write8(Nops[Rest - 1][i]); + + // Finish with single byte nops. 
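To summarize the padding strategy of WriteNopData above: at most the first 15 bytes are emitted as a single long nop (extra 0x66 operand-size prefixes on top of a canonical nop of up to 10 bytes from the table), and any remainder is filled with plain 0x90 bytes by the loop that follows. A small sketch of that split; NopPlan and planNops are illustrative names, not part of the patch.

    #include <algorithm>
    #include <cstdint>

    struct NopPlan {
      uint64_t Prefixes;    // leading 0x66 operand-size prefixes
      uint64_t CoreBytes;   // bytes taken from the canonical nop table
      uint64_t SingleNops;  // trailing single-byte 0x90 nops
    };

    NopPlan planNops(uint64_t Count) {
      const uint64_t Optimal = std::min<uint64_t>(Count, 15);
      const uint64_t Prefixes = Optimal <= 10 ? 0 : Optimal - 10;
      return NopPlan{Prefixes, Optimal - Prefixes, Count - Optimal};
    }

    // planNops(13) gives {3, 10, 0}: three 0x66 prefixes plus the 10-byte nop.
    // planNops(20) gives {5, 10, 5}: a 15-byte long nop, then five 0x90s.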
+ for (uint64_t i = OptimalCount, e = Count; i != e; ++i) + OW->Write8(0x90); + + return true; +} + +/* *** */ + +namespace { +class ELFX86AsmBackend : public X86AsmBackend { +public: + Triple::OSType OSType; + ELFX86AsmBackend(const Target &T, Triple::OSType _OSType) + : X86AsmBackend(T), OSType(_OSType) { + HasReliableSymbolDifference = true; + } + + virtual bool doesSectionRequireSymbols(const MCSection &Section) const { + const MCSectionELF &ES = static_cast<const MCSectionELF&>(Section); + return ES.getFlags() & ELF::SHF_MERGE; + } +}; + +class ELFX86_32AsmBackend : public ELFX86AsmBackend { +public: + ELFX86_32AsmBackend(const Target &T, Triple::OSType OSType) + : ELFX86AsmBackend(T, OSType) {} + + MCObjectWriter *createObjectWriter(raw_ostream &OS) const { + return createELFObjectWriter(createELFObjectTargetWriter(), + OS, /*IsLittleEndian*/ true); + } + + MCELFObjectTargetWriter *createELFObjectTargetWriter() const { + return new X86ELFObjectWriter(false, OSType, ELF::EM_386, false); + } +}; + +class ELFX86_64AsmBackend : public ELFX86AsmBackend { +public: + ELFX86_64AsmBackend(const Target &T, Triple::OSType OSType) + : ELFX86AsmBackend(T, OSType) {} + + MCObjectWriter *createObjectWriter(raw_ostream &OS) const { + return createELFObjectWriter(createELFObjectTargetWriter(), + OS, /*IsLittleEndian*/ true); + } + + MCELFObjectTargetWriter *createELFObjectTargetWriter() const { + return new X86ELFObjectWriter(true, OSType, ELF::EM_X86_64, true); + } +}; + +class WindowsX86AsmBackend : public X86AsmBackend { + bool Is64Bit; + +public: + WindowsX86AsmBackend(const Target &T, bool is64Bit) + : X86AsmBackend(T) + , Is64Bit(is64Bit) { + } + + MCObjectWriter *createObjectWriter(raw_ostream &OS) const { + return createWinCOFFObjectWriter(OS, Is64Bit); + } +}; + +class DarwinX86AsmBackend : public X86AsmBackend { +public: + DarwinX86AsmBackend(const Target &T) + : X86AsmBackend(T) { } +}; + +class DarwinX86_32AsmBackend : public DarwinX86AsmBackend { +public: + DarwinX86_32AsmBackend(const Target &T) + : DarwinX86AsmBackend(T) {} + + MCObjectWriter *createObjectWriter(raw_ostream &OS) const { + return createX86MachObjectWriter(OS, /*Is64Bit=*/false, + object::mach::CTM_i386, + object::mach::CSX86_ALL); + } +}; + +class DarwinX86_64AsmBackend : public DarwinX86AsmBackend { +public: + DarwinX86_64AsmBackend(const Target &T) + : DarwinX86AsmBackend(T) { + HasReliableSymbolDifference = true; + } + + MCObjectWriter *createObjectWriter(raw_ostream &OS) const { + return createX86MachObjectWriter(OS, /*Is64Bit=*/true, + object::mach::CTM_x86_64, + object::mach::CSX86_ALL); + } + + virtual bool doesSectionRequireSymbols(const MCSection &Section) const { + // Temporary labels in the string literals sections require symbols. The + // issue is that the x86_64 relocation format does not allow symbol + + // offset, and so the linker does not have enough information to resolve the + // access to the appropriate atom unless an external relocation is used. For + // non-cstring sections, we expect the compiler to use a non-temporary label + // for anything that could have an addend pointing outside the symbol. + // + // See <rdar://problem/4765733>. + const MCSectionMachO &SMO = static_cast<const MCSectionMachO&>(Section); + return SMO.getType() == MCSectionMachO::S_CSTRING_LITERALS; + } + + virtual bool isSectionAtomizable(const MCSection &Section) const { + const MCSectionMachO &SMO = static_cast<const MCSectionMachO&>(Section); + // Fixed sized data sections are uniqued, they cannot be diced into atoms. 
+ switch (SMO.getType()) { + default: + return true; + + case MCSectionMachO::S_4BYTE_LITERALS: + case MCSectionMachO::S_8BYTE_LITERALS: + case MCSectionMachO::S_16BYTE_LITERALS: + case MCSectionMachO::S_LITERAL_POINTERS: + case MCSectionMachO::S_NON_LAZY_SYMBOL_POINTERS: + case MCSectionMachO::S_LAZY_SYMBOL_POINTERS: + case MCSectionMachO::S_MOD_INIT_FUNC_POINTERS: + case MCSectionMachO::S_MOD_TERM_FUNC_POINTERS: + case MCSectionMachO::S_INTERPOSING: + return false; + } + } +}; + +} // end anonymous namespace + +MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T, StringRef TT) { + Triple TheTriple(TT); + + if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO) + return new DarwinX86_32AsmBackend(T); + + if (TheTriple.isOSWindows()) + return new WindowsX86AsmBackend(T, false); + + return new ELFX86_32AsmBackend(T, TheTriple.getOS()); +} + +MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T, StringRef TT) { + Triple TheTriple(TT); + + if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO) + return new DarwinX86_64AsmBackend(T); + + if (TheTriple.isOSWindows()) + return new WindowsX86AsmBackend(T, true); + + return new ELFX86_64AsmBackend(T, TheTriple.getOS()); +} diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h new file mode 100644 index 0000000..e6ba705 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h @@ -0,0 +1,548 @@ +//===-- X86BaseInfo.h - Top level definitions for X86 -------- --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains small standalone helper functions and enum definitions for +// the X86 target useful for the compiler back-end and the MC libraries. +// As such, it deliberately does not include references to LLVM core +// code gen types, passes, etc.. +// +//===----------------------------------------------------------------------===// + +#ifndef X86BASEINFO_H +#define X86BASEINFO_H + +#include "X86MCTargetDesc.h" +#include "llvm/Support/DataTypes.h" +#include <cassert> + +namespace llvm { + +namespace X86 { + // Enums for memory operand decoding. Each memory operand is represented with + // a 5 operand sequence in the form: + // [BaseReg, ScaleAmt, IndexReg, Disp, Segment] + // These enums help decode this. + enum { + AddrBaseReg = 0, + AddrScaleAmt = 1, + AddrIndexReg = 2, + AddrDisp = 3, + + /// AddrSegmentReg - The operand # of the segment in the memory operand. + AddrSegmentReg = 4, + + /// AddrNumOperands - Total number of operands in a memory reference. + AddrNumOperands = 5 + }; +} // end namespace X86; + + +/// X86II - This namespace holds all of the target specific flags that +/// instruction info tracks. +/// +namespace X86II { + /// Target Operand Flag enum. + enum TOF { + //===------------------------------------------------------------------===// + // X86 Specific MachineOperand flags. + + MO_NO_FLAG, + + /// MO_GOT_ABSOLUTE_ADDRESS - On a symbol operand, this represents a + /// relocation of: + /// SYMBOL_LABEL + [. 
- PICBASELABEL] + MO_GOT_ABSOLUTE_ADDRESS, + + /// MO_PIC_BASE_OFFSET - On a symbol operand this indicates that the + /// immediate should get the value of the symbol minus the PIC base label: + /// SYMBOL_LABEL - PICBASELABEL + MO_PIC_BASE_OFFSET, + + /// MO_GOT - On a symbol operand this indicates that the immediate is the + /// offset to the GOT entry for the symbol name from the base of the GOT. + /// + /// See the X86-64 ELF ABI supplement for more details. + /// SYMBOL_LABEL @GOT + MO_GOT, + + /// MO_GOTOFF - On a symbol operand this indicates that the immediate is + /// the offset to the location of the symbol name from the base of the GOT. + /// + /// See the X86-64 ELF ABI supplement for more details. + /// SYMBOL_LABEL @GOTOFF + MO_GOTOFF, + + /// MO_GOTPCREL - On a symbol operand this indicates that the immediate is + /// offset to the GOT entry for the symbol name from the current code + /// location. + /// + /// See the X86-64 ELF ABI supplement for more details. + /// SYMBOL_LABEL @GOTPCREL + MO_GOTPCREL, + + /// MO_PLT - On a symbol operand this indicates that the immediate is + /// offset to the PLT entry of symbol name from the current code location. + /// + /// See the X86-64 ELF ABI supplement for more details. + /// SYMBOL_LABEL @PLT + MO_PLT, + + /// MO_TLSGD - On a symbol operand this indicates that the immediate is + /// some TLS offset. + /// + /// See 'ELF Handling for Thread-Local Storage' for more details. + /// SYMBOL_LABEL @TLSGD + MO_TLSGD, + + /// MO_GOTTPOFF - On a symbol operand this indicates that the immediate is + /// some TLS offset. + /// + /// See 'ELF Handling for Thread-Local Storage' for more details. + /// SYMBOL_LABEL @GOTTPOFF + MO_GOTTPOFF, + + /// MO_INDNTPOFF - On a symbol operand this indicates that the immediate is + /// some TLS offset. + /// + /// See 'ELF Handling for Thread-Local Storage' for more details. + /// SYMBOL_LABEL @INDNTPOFF + MO_INDNTPOFF, + + /// MO_TPOFF - On a symbol operand this indicates that the immediate is + /// some TLS offset. + /// + /// See 'ELF Handling for Thread-Local Storage' for more details. + /// SYMBOL_LABEL @TPOFF + MO_TPOFF, + + /// MO_NTPOFF - On a symbol operand this indicates that the immediate is + /// some TLS offset. + /// + /// See 'ELF Handling for Thread-Local Storage' for more details. + /// SYMBOL_LABEL @NTPOFF + MO_NTPOFF, + + /// MO_DLLIMPORT - On a symbol operand "FOO", this indicates that the + /// reference is actually to the "__imp_FOO" symbol. This is used for + /// dllimport linkage on windows. + MO_DLLIMPORT, + + /// MO_DARWIN_STUB - On a symbol operand "FOO", this indicates that the + /// reference is actually to the "FOO$stub" symbol. This is used for calls + /// and jumps to external functions on Tiger and earlier. + MO_DARWIN_STUB, + + /// MO_DARWIN_NONLAZY - On a symbol operand "FOO", this indicates that the + /// reference is actually to the "FOO$non_lazy_ptr" symbol, which is a + /// non-PIC-base-relative reference to a non-hidden dyld lazy pointer stub. + MO_DARWIN_NONLAZY, + + /// MO_DARWIN_NONLAZY_PIC_BASE - On a symbol operand "FOO", this indicates + /// that the reference is actually to "FOO$non_lazy_ptr - PICBASE", which is + /// a PIC-base-relative reference to a non-hidden dyld lazy pointer stub. + MO_DARWIN_NONLAZY_PIC_BASE, + + /// MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE - On a symbol operand "FOO", this + /// indicates that the reference is actually to "FOO$non_lazy_ptr -PICBASE", + /// which is a PIC-base-relative reference to a hidden dyld lazy pointer + /// stub. 
+ MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE, + + /// MO_TLVP - On a symbol operand this indicates that the immediate is + /// some TLS offset. + /// + /// This is the TLS offset for the Darwin TLS mechanism. + MO_TLVP, + + /// MO_TLVP_PIC_BASE - On a symbol operand this indicates that the immediate + /// is some TLS offset from the picbase. + /// + /// This is the 32-bit TLS offset for Darwin TLS in PIC mode. + MO_TLVP_PIC_BASE + }; + + enum { + //===------------------------------------------------------------------===// + // Instruction encodings. These are the standard/most common forms for X86 + // instructions. + // + + // PseudoFrm - This represents an instruction that is a pseudo instruction + // or one that has not been implemented yet. It is illegal to code generate + // it, but tolerated for intermediate implementation stages. + Pseudo = 0, + + /// Raw - This form is for instructions that don't have any operands, so + /// they are just a fixed opcode value, like 'leave'. + RawFrm = 1, + + /// AddRegFrm - This form is used for instructions like 'push r32' that have + /// their one register operand added to their opcode. + AddRegFrm = 2, + + /// MRMDestReg - This form is used for instructions that use the Mod/RM byte + /// to specify a destination, which in this case is a register. + /// + MRMDestReg = 3, + + /// MRMDestMem - This form is used for instructions that use the Mod/RM byte + /// to specify a destination, which in this case is memory. + /// + MRMDestMem = 4, + + /// MRMSrcReg - This form is used for instructions that use the Mod/RM byte + /// to specify a source, which in this case is a register. + /// + MRMSrcReg = 5, + + /// MRMSrcMem - This form is used for instructions that use the Mod/RM byte + /// to specify a source, which in this case is memory. + /// + MRMSrcMem = 6, + + /// MRM[0-7][rm] - These forms are used to represent instructions that use + /// a Mod/RM byte, and use the middle field to hold extended opcode + /// information. In the intel manual these are represented as /0, /1, ... + /// + + // First, instructions that operate on a register r/m operand... + MRM0r = 16, MRM1r = 17, MRM2r = 18, MRM3r = 19, // Format /0 /1 /2 /3 + MRM4r = 20, MRM5r = 21, MRM6r = 22, MRM7r = 23, // Format /4 /5 /6 /7 + + // Next, instructions that operate on a memory r/m operand... + MRM0m = 24, MRM1m = 25, MRM2m = 26, MRM3m = 27, // Format /0 /1 /2 /3 + MRM4m = 28, MRM5m = 29, MRM6m = 30, MRM7m = 31, // Format /4 /5 /6 /7 + + // MRMInitReg - This form is used for instructions whose source and + // destinations are the same register. + MRMInitReg = 32, + + //// MRM_C1 - A mod/rm byte of exactly 0xC1. + MRM_C1 = 33, + MRM_C2 = 34, + MRM_C3 = 35, + MRM_C4 = 36, + MRM_C8 = 37, + MRM_C9 = 38, + MRM_E8 = 39, + MRM_F0 = 40, + MRM_F8 = 41, + MRM_F9 = 42, + MRM_D0 = 45, + MRM_D1 = 46, + + /// RawFrmImm8 - This is used for the ENTER instruction, which has two + /// immediates, the first of which is a 16-bit immediate (specified by + /// the imm encoding) and the second is a 8-bit fixed value. + RawFrmImm8 = 43, + + /// RawFrmImm16 - This is used for CALL FAR instructions, which have two + /// immediates, the first of which is a 16 or 32-bit immediate (specified by + /// the imm encoding) and the second is a 16-bit fixed value. In the AMD + /// manual, this operand is described as pntr16:32 and pntr16:16 + RawFrmImm16 = 44, + + FormMask = 63, + + //===------------------------------------------------------------------===// + // Actual flags... 
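// Aside: before the flag bits that follow, a minimal standalone sketch (not
// part of the imported sources) of how the form field just defined is
// consumed. The low six bits of TSFlags (FormMask == 63) select the encoding
// form, and for the MRM[0-7]r / MRM[0-7]m groups the /digit is the distance
// from the first member of the group.
#include <cstdint>
static unsigned getForm(uint64_t TSFlags) { return unsigned(TSFlags) & 63; }
static bool isMRMRegDigitForm(unsigned Form) { return Form >= 16 && Form <= 23; } // MRM0r..MRM7r
static bool isMRMMemDigitForm(unsigned Form) { return Form >= 24 && Form <= 31; } // MRM0m..MRM7m
// Example: a /5 register form has Form == 21 (MRM5r), so the ModRM reg field
// to emit is 21 - 16 == 5.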
+ + // OpSize - Set if this instruction requires an operand size prefix (0x66), + // which most often indicates that the instruction operates on 16 bit data + // instead of 32 bit data. + OpSize = 1 << 6, + + // AsSize - Set if this instruction requires an operand size prefix (0x67), + // which most often indicates that the instruction address 16 bit address + // instead of 32 bit address (or 32 bit address in 64 bit mode). + AdSize = 1 << 7, + + //===------------------------------------------------------------------===// + // Op0Mask - There are several prefix bytes that are used to form two byte + // opcodes. These are currently 0x0F, 0xF3, and 0xD8-0xDF. This mask is + // used to obtain the setting of this field. If no bits in this field is + // set, there is no prefix byte for obtaining a multibyte opcode. + // + Op0Shift = 8, + Op0Mask = 0x1F << Op0Shift, + + // TB - TwoByte - Set if this instruction has a two byte opcode, which + // starts with a 0x0F byte before the real opcode. + TB = 1 << Op0Shift, + + // REP - The 0xF3 prefix byte indicating repetition of the following + // instruction. + REP = 2 << Op0Shift, + + // D8-DF - These escape opcodes are used by the floating point unit. These + // values must remain sequential. + D8 = 3 << Op0Shift, D9 = 4 << Op0Shift, + DA = 5 << Op0Shift, DB = 6 << Op0Shift, + DC = 7 << Op0Shift, DD = 8 << Op0Shift, + DE = 9 << Op0Shift, DF = 10 << Op0Shift, + + // XS, XD - These prefix codes are for single and double precision scalar + // floating point operations performed in the SSE registers. + XD = 11 << Op0Shift, XS = 12 << Op0Shift, + + // T8, TA, A6, A7 - Prefix after the 0x0F prefix. + T8 = 13 << Op0Shift, TA = 14 << Op0Shift, + A6 = 15 << Op0Shift, A7 = 16 << Op0Shift, + + // TF - Prefix before and after 0x0F + TF = 17 << Op0Shift, + + //===------------------------------------------------------------------===// + // REX_W - REX prefixes are instruction prefixes used in 64-bit mode. + // They are used to specify GPRs and SSE registers, 64-bit operand size, + // etc. We only cares about REX.W and REX.R bits and only the former is + // statically determined. + // + REXShift = Op0Shift + 5, + REX_W = 1 << REXShift, + + //===------------------------------------------------------------------===// + // This three-bit field describes the size of an immediate operand. Zero is + // unused so that we can tell if we forgot to set a value. + ImmShift = REXShift + 1, + ImmMask = 7 << ImmShift, + Imm8 = 1 << ImmShift, + Imm8PCRel = 2 << ImmShift, + Imm16 = 3 << ImmShift, + Imm16PCRel = 4 << ImmShift, + Imm32 = 5 << ImmShift, + Imm32PCRel = 6 << ImmShift, + Imm64 = 7 << ImmShift, + + //===------------------------------------------------------------------===// + // FP Instruction Classification... Zero is non-fp instruction. + + // FPTypeMask - Mask for all of the FP types... + FPTypeShift = ImmShift + 3, + FPTypeMask = 7 << FPTypeShift, + + // NotFP - The default, set for instructions that do not use FP registers. + NotFP = 0 << FPTypeShift, + + // ZeroArgFP - 0 arg FP instruction which implicitly pushes ST(0), f.e. fld0 + ZeroArgFP = 1 << FPTypeShift, + + // OneArgFP - 1 arg FP instructions which implicitly read ST(0), such as fst + OneArgFP = 2 << FPTypeShift, + + // OneArgFPRW - 1 arg FP instruction which implicitly read ST(0) and write a + // result back to ST(0). For example, fcos, fsqrt, etc. 
+ // + OneArgFPRW = 3 << FPTypeShift, + + // TwoArgFP - 2 arg FP instructions which implicitly read ST(0), and an + // explicit argument, storing the result to either ST(0) or the implicit + // argument. For example: fadd, fsub, fmul, etc... + TwoArgFP = 4 << FPTypeShift, + + // CompareFP - 2 arg FP instructions which implicitly read ST(0) and an + // explicit argument, but have no destination. Example: fucom, fucomi, ... + CompareFP = 5 << FPTypeShift, + + // CondMovFP - "2 operand" floating point conditional move instructions. + CondMovFP = 6 << FPTypeShift, + + // SpecialFP - Special instruction forms. Dispatch by opcode explicitly. + SpecialFP = 7 << FPTypeShift, + + // Lock prefix + LOCKShift = FPTypeShift + 3, + LOCK = 1 << LOCKShift, + + // Segment override prefixes. Currently we just need ability to address + // stuff in gs and fs segments. + SegOvrShift = LOCKShift + 1, + SegOvrMask = 3 << SegOvrShift, + FS = 1 << SegOvrShift, + GS = 2 << SegOvrShift, + + // Execution domain for SSE instructions in bits 23, 24. + // 0 in bits 23-24 means normal, non-SSE instruction. + SSEDomainShift = SegOvrShift + 2, + + OpcodeShift = SSEDomainShift + 2, + + //===------------------------------------------------------------------===// + /// VEX - The opcode prefix used by AVX instructions + VEXShift = OpcodeShift + 8, + VEX = 1U << 0, + + /// VEX_W - Has a opcode specific functionality, but is used in the same + /// way as REX_W is for regular SSE instructions. + VEX_W = 1U << 1, + + /// VEX_4V - Used to specify an additional AVX/SSE register. Several 2 + /// address instructions in SSE are represented as 3 address ones in AVX + /// and the additional register is encoded in VEX_VVVV prefix. + VEX_4V = 1U << 2, + + /// VEX_I8IMM - Specifies that the last register used in a AVX instruction, + /// must be encoded in the i8 immediate field. This usually happens in + /// instructions with 4 operands. + VEX_I8IMM = 1U << 3, + + /// VEX_L - Stands for a bit in the VEX opcode prefix meaning the current + /// instruction uses 256-bit wide registers. This is usually auto detected + /// if a VR256 register is used, but some AVX instructions also have this + /// field marked when using a f256 memory references. + VEX_L = 1U << 4, + + // VEX_LIG - Specifies that this instruction ignores the L-bit in the VEX + // prefix. Usually used for scalar instructions. Needed by disassembler. + VEX_LIG = 1U << 5, + + /// Has3DNow0F0FOpcode - This flag indicates that the instruction uses the + /// wacky 0x0F 0x0F prefix for 3DNow! instructions. The manual documents + /// this as having a 0x0F prefix with a 0x0F opcode, and each instruction + /// storing a classifier in the imm8 field. To simplify our implementation, + /// we handle this by storeing the classifier in the opcode field and using + /// this flag to indicate that the encoder should do the wacky 3DNow! thing. + Has3DNow0F0FOpcode = 1U << 6 + }; + + // getBaseOpcodeFor - This function returns the "base" X86 opcode for the + // specified machine instruction. + // + static inline unsigned char getBaseOpcodeFor(uint64_t TSFlags) { + return TSFlags >> X86II::OpcodeShift; + } + + static inline bool hasImm(uint64_t TSFlags) { + return (TSFlags & X86II::ImmMask) != 0; + } + + /// getSizeOfImm - Decode the "size of immediate" field from the TSFlags field + /// of the specified instruction. 
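// Aside: a worked example (illustrative only) of packing and unpacking the
// bit-fields above, using the shift amounts they resolve to: FormMask == 63,
// OpSize == 1 << 6, Op0Shift == 8, ImmShift == 14, OpcodeShift == 25.
#include <cstdint>
static uint64_t makeFlags(unsigned Form, unsigned Opcode) {
  return uint64_t(Form)             // e.g. MRMSrcReg == 5
       | (1ull << 6)                // OpSize: operand-size (0x66) prefix
       | (1ull << 8)                // TB: 0x0F escape (Op0 field value 1)
       | (1ull << 14)               // Imm8 (ImmMask field value 1)
       | (uint64_t(Opcode) << 25);  // base opcode byte
}
// Decoding mirrors the helpers in this header:
//   Flags & 63           -> 5      (MRMSrcReg)
//   (Flags >> 8) & 0x1F  -> 1      (TB)
//   (Flags >> 25) & 0xFF -> Opcode (what getBaseOpcodeFor returns)
//   Flags & (7 << 14)    -> Imm8, so getSizeOfImm (defined just below) is 1.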
+ static inline unsigned getSizeOfImm(uint64_t TSFlags) { + switch (TSFlags & X86II::ImmMask) { + default: assert(0 && "Unknown immediate size"); + case X86II::Imm8: + case X86II::Imm8PCRel: return 1; + case X86II::Imm16: + case X86II::Imm16PCRel: return 2; + case X86II::Imm32: + case X86II::Imm32PCRel: return 4; + case X86II::Imm64: return 8; + } + } + + /// isImmPCRel - Return true if the immediate of the specified instruction's + /// TSFlags indicates that it is pc relative. + static inline unsigned isImmPCRel(uint64_t TSFlags) { + switch (TSFlags & X86II::ImmMask) { + default: assert(0 && "Unknown immediate size"); + case X86II::Imm8PCRel: + case X86II::Imm16PCRel: + case X86II::Imm32PCRel: + return true; + case X86II::Imm8: + case X86II::Imm16: + case X86II::Imm32: + case X86II::Imm64: + return false; + } + } + + /// getMemoryOperandNo - The function returns the MCInst operand # for the + /// first field of the memory operand. If the instruction doesn't have a + /// memory operand, this returns -1. + /// + /// Note that this ignores tied operands. If there is a tied register which + /// is duplicated in the MCInst (e.g. "EAX = addl EAX, [mem]") it is only + /// counted as one operand. + /// + static inline int getMemoryOperandNo(uint64_t TSFlags) { + switch (TSFlags & X86II::FormMask) { + case X86II::MRMInitReg: assert(0 && "FIXME: Remove this form"); + default: assert(0 && "Unknown FormMask value in getMemoryOperandNo!"); + case X86II::Pseudo: + case X86II::RawFrm: + case X86II::AddRegFrm: + case X86II::MRMDestReg: + case X86II::MRMSrcReg: + case X86II::RawFrmImm8: + case X86II::RawFrmImm16: + return -1; + case X86II::MRMDestMem: + return 0; + case X86II::MRMSrcMem: { + bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V; + unsigned FirstMemOp = 1; + if (HasVEX_4V) + ++FirstMemOp;// Skip the register source (which is encoded in VEX_VVVV). + + // FIXME: Maybe lea should have its own form? This is a horrible hack. + //if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r || + // Opcode == X86::LEA16r || Opcode == X86::LEA32r) + return FirstMemOp; + } + case X86II::MRM0r: case X86II::MRM1r: + case X86II::MRM2r: case X86II::MRM3r: + case X86II::MRM4r: case X86II::MRM5r: + case X86II::MRM6r: case X86II::MRM7r: + return -1; + case X86II::MRM0m: case X86II::MRM1m: + case X86II::MRM2m: case X86II::MRM3m: + case X86II::MRM4m: case X86II::MRM5m: + case X86II::MRM6m: case X86II::MRM7m: + return 0; + case X86II::MRM_C1: + case X86II::MRM_C2: + case X86II::MRM_C3: + case X86II::MRM_C4: + case X86II::MRM_C8: + case X86II::MRM_C9: + case X86II::MRM_E8: + case X86II::MRM_F0: + case X86II::MRM_F8: + case X86II::MRM_F9: + case X86II::MRM_D0: + case X86II::MRM_D1: + return -1; + } + } + + /// isX86_64ExtendedReg - Is the MachineOperand a x86-64 extended (r8 or + /// higher) register? e.g. r8, xmm8, xmm13, etc. 
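// Aside: a rough illustration (assumed example, not code from the diff) of how
// the index returned by getMemoryOperandNo combines with the Addr* enums. For
// a plain MRMSrcMem instruction such as "movl (%rax,%rbx,4), %ecx" the MCInst
// operand list is
//   0: destination register
//   1: AddrBaseReg  2: AddrScaleAmt  3: AddrIndexReg  4: AddrDisp  5: AddrSegmentReg
// so getMemoryOperandNo yields 1 and the displacement operand sits at
// MemOpStart + AddrDisp == 1 + 3 == 4.
static unsigned dispOperandIndex(int MemOpStart) {
  return unsigned(MemOpStart) + 3;   // 3 == X86::AddrDisp
}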
+ static inline bool isX86_64ExtendedReg(unsigned RegNo) { + switch (RegNo) { + default: break; + case X86::R8: case X86::R9: case X86::R10: case X86::R11: + case X86::R12: case X86::R13: case X86::R14: case X86::R15: + case X86::R8D: case X86::R9D: case X86::R10D: case X86::R11D: + case X86::R12D: case X86::R13D: case X86::R14D: case X86::R15D: + case X86::R8W: case X86::R9W: case X86::R10W: case X86::R11W: + case X86::R12W: case X86::R13W: case X86::R14W: case X86::R15W: + case X86::R8B: case X86::R9B: case X86::R10B: case X86::R11B: + case X86::R12B: case X86::R13B: case X86::R14B: case X86::R15B: + case X86::XMM8: case X86::XMM9: case X86::XMM10: case X86::XMM11: + case X86::XMM12: case X86::XMM13: case X86::XMM14: case X86::XMM15: + case X86::YMM8: case X86::YMM9: case X86::YMM10: case X86::YMM11: + case X86::YMM12: case X86::YMM13: case X86::YMM14: case X86::YMM15: + case X86::CR8: case X86::CR9: case X86::CR10: case X86::CR11: + case X86::CR12: case X86::CR13: case X86::CR14: case X86::CR15: + return true; + } + return false; + } + + static inline bool isX86_64NonExtLowByteReg(unsigned reg) { + return (reg == X86::SPL || reg == X86::BPL || + reg == X86::SIL || reg == X86::DIL); + } +} + +} // end namespace llvm; + +#endif diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h new file mode 100644 index 0000000..17d242a --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h @@ -0,0 +1,33 @@ +//===-- X86/X86FixupKinds.h - X86 Specific Fixup Entries --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_X86_X86FIXUPKINDS_H +#define LLVM_X86_X86FIXUPKINDS_H + +#include "llvm/MC/MCFixup.h" + +namespace llvm { +namespace X86 { +enum Fixups { + reloc_riprel_4byte = FirstTargetFixupKind, // 32-bit rip-relative + reloc_riprel_4byte_movq_load, // 32-bit rip-relative in movq + reloc_signed_4byte, // 32-bit signed. Unlike FK_Data_4 + // this will be sign extended at + // runtime. + reloc_global_offset_table, // 32-bit, relative to the start + // of the instruction. Used only + // for _GLOBAL_OFFSET_TABLE_. + // Marker + LastTargetFixupKind, + NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind +}; +} +} + +#endif diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp new file mode 100644 index 0000000..2703100 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp @@ -0,0 +1,138 @@ +//===-- X86MCAsmInfo.cpp - X86 asm properties -----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declarations of the X86MCAsmInfo properties. 
+// +//===----------------------------------------------------------------------===// + +#include "X86MCAsmInfo.h" +#include "llvm/ADT/Triple.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ELF.h" +using namespace llvm; + +enum AsmWriterFlavorTy { + // Note: This numbering has to match the GCC assembler dialects for inline + // asm alternatives to work right. + ATT = 0, Intel = 1 +}; + +static cl::opt<AsmWriterFlavorTy> +AsmWriterFlavor("x86-asm-syntax", cl::init(ATT), + cl::desc("Choose style of code to emit from X86 backend:"), + cl::values(clEnumValN(ATT, "att", "Emit AT&T-style assembly"), + clEnumValN(Intel, "intel", "Emit Intel-style assembly"), + clEnumValEnd)); + + +static const char *const x86_asm_table[] = { + "{si}", "S", + "{di}", "D", + "{ax}", "a", + "{cx}", "c", + "{memory}", "memory", + "{flags}", "", + "{dirflag}", "", + "{fpsr}", "", + "{fpcr}", "", + "{cc}", "cc", + 0,0}; + +X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &T) { + bool is64Bit = T.getArch() == Triple::x86_64; + if (is64Bit) + PointerSize = 8; + + AsmTransCBE = x86_asm_table; + AssemblerDialect = AsmWriterFlavor; + + TextAlignFillValue = 0x90; + + if (!is64Bit) + Data64bitsDirective = 0; // we can't emit a 64-bit unit + + // Use ## as a comment string so that .s files generated by llvm can go + // through the GCC preprocessor without causing an error. This is needed + // because "clang foo.s" runs the C preprocessor, which is usually reserved + // for .S files on other systems. Perhaps this is because the file system + // wasn't always case preserving or something. + CommentString = "##"; + PCSymbol = "."; + + SupportsDebugInformation = true; + DwarfUsesInlineInfoSection = true; + + // Exceptions handling + ExceptionsType = ExceptionHandling::DwarfCFI; +} + +X86_64MCAsmInfoDarwin::X86_64MCAsmInfoDarwin(const Triple &Triple) + : X86MCAsmInfoDarwin(Triple) { +} + +X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) { + if (T.getArch() == Triple::x86_64) + PointerSize = 8; + + AsmTransCBE = x86_asm_table; + AssemblerDialect = AsmWriterFlavor; + + TextAlignFillValue = 0x90; + + PrivateGlobalPrefix = ".L"; + WeakRefDirective = "\t.weak\t"; + PCSymbol = "."; + + // Set up DWARF directives + HasLEB128 = true; // Target asm supports leb128 directives (little-endian) + + // Debug Information + SupportsDebugInformation = true; + + // Exceptions handling + ExceptionsType = ExceptionHandling::DwarfCFI; + + // OpenBSD has buggy support for .quad in 32-bit mode, just split into two + // .words. 
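// Aside: a small sketch (assumed helper, not part of MCAsmInfo) of how a
// pair-wise table like x86_asm_table above can be searched. Entries come in
// {constraint, replacement} pairs and the list is terminated by a 0,0 pair.
#include <cstring>
static const char *translateConstraint(const char *const *Table,
                                       const char *From) {
  for (unsigned i = 0; Table[i]; i += 2)
    if (std::strcmp(Table[i], From) == 0)
      return Table[i + 1];           // e.g. "{ax}" -> "a", "{dirflag}" -> ""
  return 0;                          // no translation registered
}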
+ if (T.getOS() == Triple::OpenBSD && T.getArch() == Triple::x86) + Data64bitsDirective = 0; +} + +const MCExpr * +X86_64MCAsmInfoDarwin::getExprForPersonalitySymbol(const MCSymbol *Sym, + unsigned Encoding, + MCStreamer &Streamer) const { + MCContext &Context = Streamer.getContext(); + const MCExpr *Res = + MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTPCREL, Context); + const MCExpr *Four = MCConstantExpr::Create(4, Context); + return MCBinaryExpr::CreateAdd(Res, Four, Context); +} + +const MCSection *X86ELFMCAsmInfo:: +getNonexecutableStackSection(MCContext &Ctx) const { + return Ctx.getELFSection(".note.GNU-stack", ELF::SHT_PROGBITS, + 0, SectionKind::getMetadata()); +} + +X86MCAsmInfoCOFF::X86MCAsmInfoCOFF(const Triple &Triple) { + if (Triple.getArch() == Triple::x86_64) { + GlobalPrefix = ""; + PrivateGlobalPrefix = ".L"; + } + + AsmTransCBE = x86_asm_table; + AssemblerDialect = AsmWriterFlavor; + + TextAlignFillValue = 0x90; +} diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h new file mode 100644 index 0000000..2cd4c8e --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h @@ -0,0 +1,46 @@ +//=====-- X86MCAsmInfo.h - X86 asm properties -----------------*- C++ -*--====// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declaration of the X86MCAsmInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef X86TARGETASMINFO_H +#define X86TARGETASMINFO_H + +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCAsmInfoCOFF.h" +#include "llvm/MC/MCAsmInfoDarwin.h" + +namespace llvm { + class Triple; + + struct X86MCAsmInfoDarwin : public MCAsmInfoDarwin { + explicit X86MCAsmInfoDarwin(const Triple &Triple); + }; + + struct X86_64MCAsmInfoDarwin : public X86MCAsmInfoDarwin { + explicit X86_64MCAsmInfoDarwin(const Triple &Triple); + virtual const MCExpr * + getExprForPersonalitySymbol(const MCSymbol *Sym, + unsigned Encoding, + MCStreamer &Streamer) const; + }; + + struct X86ELFMCAsmInfo : public MCAsmInfo { + explicit X86ELFMCAsmInfo(const Triple &Triple); + virtual const MCSection *getNonexecutableStackSection(MCContext &Ctx) const; + }; + + struct X86MCAsmInfoCOFF : public MCAsmInfoCOFF { + explicit X86MCAsmInfoCOFF(const Triple &Triple); + }; +} // namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp new file mode 100644 index 0000000..2eee112 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -0,0 +1,1074 @@ +//===-- X86/X86MCCodeEmitter.cpp - Convert X86 code to machine code -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the X86MCCodeEmitter class. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "mccodeemitter" +#include "MCTargetDesc/X86MCTargetDesc.h" +#include "MCTargetDesc/X86BaseInfo.h" +#include "MCTargetDesc/X86FixupKinds.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +namespace { +class X86MCCodeEmitter : public MCCodeEmitter { + X86MCCodeEmitter(const X86MCCodeEmitter &); // DO NOT IMPLEMENT + void operator=(const X86MCCodeEmitter &); // DO NOT IMPLEMENT + const MCInstrInfo &MCII; + const MCSubtargetInfo &STI; + MCContext &Ctx; +public: + X86MCCodeEmitter(const MCInstrInfo &mcii, const MCSubtargetInfo &sti, + MCContext &ctx) + : MCII(mcii), STI(sti), Ctx(ctx) { + } + + ~X86MCCodeEmitter() {} + + bool is64BitMode() const { + // FIXME: Can tablegen auto-generate this? + return (STI.getFeatureBits() & X86::Mode64Bit) != 0; + } + + static unsigned GetX86RegNum(const MCOperand &MO) { + return X86_MC::getX86RegNum(MO.getReg()); + } + + // On regular x86, both XMM0-XMM7 and XMM8-XMM15 are encoded in the range + // 0-7 and the difference between the 2 groups is given by the REX prefix. + // In the VEX prefix, registers are seen sequencially from 0-15 and encoded + // in 1's complement form, example: + // + // ModRM field => XMM9 => 1 + // VEX.VVVV => XMM9 => ~9 + // + // See table 4-35 of Intel AVX Programming Reference for details. + static unsigned char getVEXRegisterEncoding(const MCInst &MI, + unsigned OpNum) { + unsigned SrcReg = MI.getOperand(OpNum).getReg(); + unsigned SrcRegNum = GetX86RegNum(MI.getOperand(OpNum)); + if ((SrcReg >= X86::XMM8 && SrcReg <= X86::XMM15) || + (SrcReg >= X86::YMM8 && SrcReg <= X86::YMM15)) + SrcRegNum += 8; + + // The registers represented through VEX_VVVV should + // be encoded in 1's complement form. + return (~SrcRegNum) & 0xf; + } + + void EmitByte(unsigned char C, unsigned &CurByte, raw_ostream &OS) const { + OS << (char)C; + ++CurByte; + } + + void EmitConstant(uint64_t Val, unsigned Size, unsigned &CurByte, + raw_ostream &OS) const { + // Output the constant in little endian byte order. + for (unsigned i = 0; i != Size; ++i) { + EmitByte(Val & 255, CurByte, OS); + Val >>= 8; + } + } + + void EmitImmediate(const MCOperand &Disp, + unsigned ImmSize, MCFixupKind FixupKind, + unsigned &CurByte, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups, + int ImmOffset = 0) const; + + inline static unsigned char ModRMByte(unsigned Mod, unsigned RegOpcode, + unsigned RM) { + assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!"); + return RM | (RegOpcode << 3) | (Mod << 6); + } + + void EmitRegModRMByte(const MCOperand &ModRMReg, unsigned RegOpcodeFld, + unsigned &CurByte, raw_ostream &OS) const { + EmitByte(ModRMByte(3, RegOpcodeFld, GetX86RegNum(ModRMReg)), CurByte, OS); + } + + void EmitSIBByte(unsigned SS, unsigned Index, unsigned Base, + unsigned &CurByte, raw_ostream &OS) const { + // SIB byte is in the same format as the ModRMByte. 
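// Aside: worked numbers (illustrative, checked with C++11 static_assert) for
// the two encodings documented above. ModRMByte packs
// rm | (reg/opcode << 3) | (mod << 6), and VEX.VVVV stores the extra register
// in inverted (1's complement) form.
static_assert((1 | (2 << 3) | (3 << 6)) == 0xD1,
              "mod=3 (register direct), /2 opcode extension, r/m=1 (ECX)");
static_assert(((~9) & 0xF) == 6, "VEX.VVVV encoding of XMM9");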
+ EmitByte(ModRMByte(SS, Index, Base), CurByte, OS); + } + + + void EmitMemModRMByte(const MCInst &MI, unsigned Op, + unsigned RegOpcodeField, + uint64_t TSFlags, unsigned &CurByte, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups) const; + + void EncodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups) const; + + void EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand, + const MCInst &MI, const MCInstrDesc &Desc, + raw_ostream &OS) const; + + void EmitSegmentOverridePrefix(uint64_t TSFlags, unsigned &CurByte, + int MemOperand, const MCInst &MI, + raw_ostream &OS) const; + + void EmitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand, + const MCInst &MI, const MCInstrDesc &Desc, + raw_ostream &OS) const; +}; + +} // end anonymous namespace + + +MCCodeEmitter *llvm::createX86MCCodeEmitter(const MCInstrInfo &MCII, + const MCSubtargetInfo &STI, + MCContext &Ctx) { + return new X86MCCodeEmitter(MCII, STI, Ctx); +} + +/// isDisp8 - Return true if this signed displacement fits in a 8-bit +/// sign-extended field. +static bool isDisp8(int Value) { + return Value == (signed char)Value; +} + +/// getImmFixupKind - Return the appropriate fixup kind to use for an immediate +/// in an instruction with the specified TSFlags. +static MCFixupKind getImmFixupKind(uint64_t TSFlags) { + unsigned Size = X86II::getSizeOfImm(TSFlags); + bool isPCRel = X86II::isImmPCRel(TSFlags); + + return MCFixup::getKindForSize(Size, isPCRel); +} + +/// Is32BitMemOperand - Return true if the specified instruction with a memory +/// operand should emit the 0x67 prefix byte in 64-bit mode due to a 32-bit +/// memory operand. Op specifies the operand # of the memoperand. +static bool Is32BitMemOperand(const MCInst &MI, unsigned Op) { + const MCOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg); + const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg); + + if ((BaseReg.getReg() != 0 && + X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg.getReg())) || + (IndexReg.getReg() != 0 && + X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg.getReg()))) + return true; + return false; +} + +/// StartsWithGlobalOffsetTable - Return true for the simple cases where this +/// expression starts with _GLOBAL_OFFSET_TABLE_. This is a needed to support +/// PIC on ELF i386 as that symbol is magic. We check only simple case that +/// are know to be used: _GLOBAL_OFFSET_TABLE_ by itself or at the start +/// of a binary expression. +static bool StartsWithGlobalOffsetTable(const MCExpr *Expr) { + if (Expr->getKind() == MCExpr::Binary) { + const MCBinaryExpr *BE = static_cast<const MCBinaryExpr *>(Expr); + Expr = BE->getLHS(); + } + + if (Expr->getKind() != MCExpr::SymbolRef) + return false; + + const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr*>(Expr); + const MCSymbol &S = Ref->getSymbol(); + return S.getName() == "_GLOBAL_OFFSET_TABLE_"; +} + +void X86MCCodeEmitter:: +EmitImmediate(const MCOperand &DispOp, unsigned Size, MCFixupKind FixupKind, + unsigned &CurByte, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups, int ImmOffset) const { + const MCExpr *Expr = NULL; + if (DispOp.isImm()) { + // If this is a simple integer displacement that doesn't require a + // relocation, emit it now. 
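// Aside: a quick illustration of the isDisp8 test defined above; on the usual
// two's-complement targets a displacement may use the one-byte form only when
// sign-extending its low byte reproduces the value.
static_assert(127 == (signed char)127,   "127 fits in disp8");
static_assert(-128 == (signed char)-128, "-128 fits in disp8");
static_assert(128 != (signed char)128,   "128 does not: its low byte sign-extends to -128");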
+ if (FixupKind != FK_PCRel_1 && + FixupKind != FK_PCRel_2 && + FixupKind != FK_PCRel_4) { + EmitConstant(DispOp.getImm()+ImmOffset, Size, CurByte, OS); + return; + } + Expr = MCConstantExpr::Create(DispOp.getImm(), Ctx); + } else { + Expr = DispOp.getExpr(); + } + + // If we have an immoffset, add it to the expression. + if ((FixupKind == FK_Data_4 || + FixupKind == MCFixupKind(X86::reloc_signed_4byte)) && + StartsWithGlobalOffsetTable(Expr)) { + assert(ImmOffset == 0); + + FixupKind = MCFixupKind(X86::reloc_global_offset_table); + ImmOffset = CurByte; + } + + // If the fixup is pc-relative, we need to bias the value to be relative to + // the start of the field, not the end of the field. + if (FixupKind == FK_PCRel_4 || + FixupKind == MCFixupKind(X86::reloc_riprel_4byte) || + FixupKind == MCFixupKind(X86::reloc_riprel_4byte_movq_load)) + ImmOffset -= 4; + if (FixupKind == FK_PCRel_2) + ImmOffset -= 2; + if (FixupKind == FK_PCRel_1) + ImmOffset -= 1; + + if (ImmOffset) + Expr = MCBinaryExpr::CreateAdd(Expr, MCConstantExpr::Create(ImmOffset, Ctx), + Ctx); + + // Emit a symbolic constant as a fixup and 4 zeros. + Fixups.push_back(MCFixup::Create(CurByte, Expr, FixupKind)); + EmitConstant(0, Size, CurByte, OS); +} + +void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op, + unsigned RegOpcodeField, + uint64_t TSFlags, unsigned &CurByte, + raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups) const{ + const MCOperand &Disp = MI.getOperand(Op+X86::AddrDisp); + const MCOperand &Base = MI.getOperand(Op+X86::AddrBaseReg); + const MCOperand &Scale = MI.getOperand(Op+X86::AddrScaleAmt); + const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg); + unsigned BaseReg = Base.getReg(); + + // Handle %rip relative addressing. + if (BaseReg == X86::RIP) { // [disp32+RIP] in X86-64 mode + assert(is64BitMode() && "Rip-relative addressing requires 64-bit mode"); + assert(IndexReg.getReg() == 0 && "Invalid rip-relative address"); + EmitByte(ModRMByte(0, RegOpcodeField, 5), CurByte, OS); + + unsigned FixupKind = X86::reloc_riprel_4byte; + + // movq loads are handled with a special relocation form which allows the + // linker to eliminate some loads for GOT references which end up in the + // same linkage unit. + if (MI.getOpcode() == X86::MOV64rm) + FixupKind = X86::reloc_riprel_4byte_movq_load; + + // rip-relative addressing is actually relative to the *next* instruction. + // Since an immediate can follow the mod/rm byte for an instruction, this + // means that we need to bias the immediate field of the instruction with + // the size of the immediate field. If we have this case, add it into the + // expression to emit. + int ImmSize = X86II::hasImm(TSFlags) ? X86II::getSizeOfImm(TSFlags) : 0; + + EmitImmediate(Disp, 4, MCFixupKind(FixupKind), + CurByte, OS, Fixups, -ImmSize); + return; + } + + unsigned BaseRegNo = BaseReg ? GetX86RegNum(Base) : -1U; + + // Determine whether a SIB byte is needed. + // If no BaseReg, issue a RIP relative instruction only if the MCE can + // resolve addresses on-the-fly, otherwise use SIB (Intel Manual 2A, table + // 2-7) and absolute references. + + if (// The SIB byte must be used if there is an index register. + IndexReg.getReg() == 0 && + // The SIB byte must be used if the base is ESP/RSP/R12, all of which + // encode to an R/M value of 4, which indicates that a SIB byte is + // present. 
+ BaseRegNo != N86::ESP && + // If there is no base register and we're in 64-bit mode, we need a SIB + // byte to emit an addr that is just 'disp32' (the non-RIP relative form). + (!is64BitMode() || BaseReg != 0)) { + + if (BaseReg == 0) { // [disp32] in X86-32 mode + EmitByte(ModRMByte(0, RegOpcodeField, 5), CurByte, OS); + EmitImmediate(Disp, 4, FK_Data_4, CurByte, OS, Fixups); + return; + } + + // If the base is not EBP/ESP and there is no displacement, use simple + // indirect register encoding, this handles addresses like [EAX]. The + // encoding for [EBP] with no displacement means [disp32] so we handle it + // by emitting a displacement of 0 below. + if (Disp.isImm() && Disp.getImm() == 0 && BaseRegNo != N86::EBP) { + EmitByte(ModRMByte(0, RegOpcodeField, BaseRegNo), CurByte, OS); + return; + } + + // Otherwise, if the displacement fits in a byte, encode as [REG+disp8]. + if (Disp.isImm() && isDisp8(Disp.getImm())) { + EmitByte(ModRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS); + EmitImmediate(Disp, 1, FK_Data_1, CurByte, OS, Fixups); + return; + } + + // Otherwise, emit the most general non-SIB encoding: [REG+disp32] + EmitByte(ModRMByte(2, RegOpcodeField, BaseRegNo), CurByte, OS); + EmitImmediate(Disp, 4, MCFixupKind(X86::reloc_signed_4byte), CurByte, OS, + Fixups); + return; + } + + // We need a SIB byte, so start by outputting the ModR/M byte first + assert(IndexReg.getReg() != X86::ESP && + IndexReg.getReg() != X86::RSP && "Cannot use ESP as index reg!"); + + bool ForceDisp32 = false; + bool ForceDisp8 = false; + if (BaseReg == 0) { + // If there is no base register, we emit the special case SIB byte with + // MOD=0, BASE=5, to JUST get the index, scale, and displacement. + EmitByte(ModRMByte(0, RegOpcodeField, 4), CurByte, OS); + ForceDisp32 = true; + } else if (!Disp.isImm()) { + // Emit the normal disp32 encoding. + EmitByte(ModRMByte(2, RegOpcodeField, 4), CurByte, OS); + ForceDisp32 = true; + } else if (Disp.getImm() == 0 && + // Base reg can't be anything that ends up with '5' as the base + // reg, it is the magic [*] nomenclature that indicates no base. + BaseRegNo != N86::EBP) { + // Emit no displacement ModR/M byte + EmitByte(ModRMByte(0, RegOpcodeField, 4), CurByte, OS); + } else if (isDisp8(Disp.getImm())) { + // Emit the disp8 encoding. + EmitByte(ModRMByte(1, RegOpcodeField, 4), CurByte, OS); + ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP + } else { + // Emit the normal disp32 encoding. + EmitByte(ModRMByte(2, RegOpcodeField, 4), CurByte, OS); + } + + // Calculate what the SS field value should be... + static const unsigned SSTable[] = { ~0U, 0, 1, ~0U, 2, ~0U, ~0U, ~0U, 3 }; + unsigned SS = SSTable[Scale.getImm()]; + + if (BaseReg == 0) { + // Handle the SIB byte for the case where there is no base, see Intel + // Manual 2A, table 2-7. The displacement has already been output. + unsigned IndexRegNo; + if (IndexReg.getReg()) + IndexRegNo = GetX86RegNum(IndexReg); + else // Examples: [ESP+1*<noreg>+4] or [scaled idx]+disp32 (MOD=0,BASE=5) + IndexRegNo = 4; + EmitSIBByte(SS, IndexRegNo, 5, CurByte, OS); + } else { + unsigned IndexRegNo; + if (IndexReg.getReg()) + IndexRegNo = GetX86RegNum(IndexReg); + else + IndexRegNo = 4; // For example [ESP+1*<noreg>+4] + EmitSIBByte(SS, IndexRegNo, GetX86RegNum(Base), CurByte, OS); + } + + // Do we need to output a displacement? 
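// Aside: a worked example (not from the diff) tying the SIB pieces above
// together for the AT&T operand 8(%ecx,%esi,8). The index register forces a
// SIB byte (ModRM r/m = 4), the displacement fits in eight bits (mod = 1),
// scale 8 maps to SS = 3 via SSTable, the index %esi is 6 and the base %ecx
// is 1.
static_assert((1 | (6 << 3) | (3 << 6)) == 0xF1,
              "SIB byte for 8(%ecx,%esi,8); a disp8 of 0x08 follows");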
+ if (ForceDisp8) + EmitImmediate(Disp, 1, FK_Data_1, CurByte, OS, Fixups); + else if (ForceDisp32 || Disp.getImm() != 0) + EmitImmediate(Disp, 4, MCFixupKind(X86::reloc_signed_4byte), CurByte, OS, + Fixups); +} + +/// EmitVEXOpcodePrefix - AVX instructions are encoded using a opcode prefix +/// called VEX. +void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, + int MemOperand, const MCInst &MI, + const MCInstrDesc &Desc, + raw_ostream &OS) const { + bool HasVEX_4V = false; + if ((TSFlags >> X86II::VEXShift) & X86II::VEX_4V) + HasVEX_4V = true; + + // VEX_R: opcode externsion equivalent to REX.R in + // 1's complement (inverted) form + // + // 1: Same as REX_R=0 (must be 1 in 32-bit mode) + // 0: Same as REX_R=1 (64 bit mode only) + // + unsigned char VEX_R = 0x1; + + // VEX_X: equivalent to REX.X, only used when a + // register is used for index in SIB Byte. + // + // 1: Same as REX.X=0 (must be 1 in 32-bit mode) + // 0: Same as REX.X=1 (64-bit mode only) + unsigned char VEX_X = 0x1; + + // VEX_B: + // + // 1: Same as REX_B=0 (ignored in 32-bit mode) + // 0: Same as REX_B=1 (64 bit mode only) + // + unsigned char VEX_B = 0x1; + + // VEX_W: opcode specific (use like REX.W, or used for + // opcode extension, or ignored, depending on the opcode byte) + unsigned char VEX_W = 0; + + // VEX_5M (VEX m-mmmmm field): + // + // 0b00000: Reserved for future use + // 0b00001: implied 0F leading opcode + // 0b00010: implied 0F 38 leading opcode bytes + // 0b00011: implied 0F 3A leading opcode bytes + // 0b00100-0b11111: Reserved for future use + // + unsigned char VEX_5M = 0x1; + + // VEX_4V (VEX vvvv field): a register specifier + // (in 1's complement form) or 1111 if unused. + unsigned char VEX_4V = 0xf; + + // VEX_L (Vector Length): + // + // 0: scalar or 128-bit vector + // 1: 256-bit vector + // + unsigned char VEX_L = 0; + + // VEX_PP: opcode extension providing equivalent + // functionality of a SIMD prefix + // + // 0b00: None + // 0b01: 66 + // 0b10: F3 + // 0b11: F2 + // + unsigned char VEX_PP = 0; + + // Encode the operand size opcode prefix as needed. + if (TSFlags & X86II::OpSize) + VEX_PP = 0x01; + + if ((TSFlags >> X86II::VEXShift) & X86II::VEX_W) + VEX_W = 1; + + if ((TSFlags >> X86II::VEXShift) & X86II::VEX_L) + VEX_L = 1; + + switch (TSFlags & X86II::Op0Mask) { + default: assert(0 && "Invalid prefix!"); + case X86II::T8: // 0F 38 + VEX_5M = 0x2; + break; + case X86II::TA: // 0F 3A + VEX_5M = 0x3; + break; + case X86II::TF: // F2 0F 38 + VEX_PP = 0x3; + VEX_5M = 0x2; + break; + case X86II::XS: // F3 0F + VEX_PP = 0x2; + break; + case X86II::XD: // F2 0F + VEX_PP = 0x3; + break; + case X86II::A6: // Bypass: Not used by VEX + case X86II::A7: // Bypass: Not used by VEX + case X86II::TB: // Bypass: Not used by VEX + case 0: + break; // No prefix! 
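// Aside: a concrete two-byte VEX case (standard AVX encoding, shown only as an
// illustration) for "vaddps %xmm3, %xmm2, %xmm1". No SIMD prefix, so pp = 0;
// the implied 0F map permits the short C5 form; L = 0 for 128-bit; and
// vvvv = (~2) & 0xF = 13 names the %xmm2 source.
static_assert((0 | (0 << 2) | (13 << 3) | (1 << 7)) == 0xE8,
              "second VEX byte: R=1 (not extended), vvvv=13, L=0, pp=0");
// Full byte sequence: C5 E8 58 CB (0x58 = ADDPS opcode, 0xCB = ModRM with
// mod=3, reg=%xmm1, r/m=%xmm3).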
+ } + + // Set the vector length to 256-bit if YMM0-YMM15 is used + for (unsigned i = 0; i != MI.getNumOperands(); ++i) { + if (!MI.getOperand(i).isReg()) + continue; + unsigned SrcReg = MI.getOperand(i).getReg(); + if (SrcReg >= X86::YMM0 && SrcReg <= X86::YMM15) + VEX_L = 1; + } + + // Classify VEX_B, VEX_4V, VEX_R, VEX_X + unsigned CurOp = 0; + switch (TSFlags & X86II::FormMask) { + case X86II::MRMInitReg: assert(0 && "FIXME: Remove this!"); + case X86II::MRMDestMem: { + // MRMDestMem instructions forms: + // MemAddr, src1(ModR/M) + // MemAddr, src1(VEX_4V), src2(ModR/M) + // MemAddr, src1(ModR/M), imm8 + // + if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrBaseReg).getReg())) + VEX_B = 0x0; + if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrIndexReg).getReg())) + VEX_X = 0x0; + + CurOp = X86::AddrNumOperands; + if (HasVEX_4V) + VEX_4V = getVEXRegisterEncoding(MI, CurOp++); + + const MCOperand &MO = MI.getOperand(CurOp); + if (MO.isReg() && X86II::isX86_64ExtendedReg(MO.getReg())) + VEX_R = 0x0; + break; + } + case X86II::MRMSrcMem: { + // MRMSrcMem instructions forms: + // src1(ModR/M), MemAddr + // src1(ModR/M), src2(VEX_4V), MemAddr + // src1(ModR/M), MemAddr, imm8 + // src1(ModR/M), MemAddr, src2(VEX_I8IMM) + // + if (X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg())) + VEX_R = 0x0; + + unsigned MemAddrOffset = 1; + if (HasVEX_4V) { + VEX_4V = getVEXRegisterEncoding(MI, 1); + MemAddrOffset++; + } + + if (X86II::isX86_64ExtendedReg( + MI.getOperand(MemAddrOffset+X86::AddrBaseReg).getReg())) + VEX_B = 0x0; + if (X86II::isX86_64ExtendedReg( + MI.getOperand(MemAddrOffset+X86::AddrIndexReg).getReg())) + VEX_X = 0x0; + break; + } + case X86II::MRM0m: case X86II::MRM1m: + case X86II::MRM2m: case X86II::MRM3m: + case X86II::MRM4m: case X86II::MRM5m: + case X86II::MRM6m: case X86II::MRM7m: + // MRM[0-9]m instructions forms: + // MemAddr + if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrBaseReg).getReg())) + VEX_B = 0x0; + if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrIndexReg).getReg())) + VEX_X = 0x0; + break; + case X86II::MRMSrcReg: + // MRMSrcReg instructions forms: + // dst(ModR/M), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM) + // dst(ModR/M), src1(ModR/M) + // dst(ModR/M), src1(ModR/M), imm8 + // + if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) + VEX_R = 0x0; + CurOp++; + + if (HasVEX_4V) + VEX_4V = getVEXRegisterEncoding(MI, CurOp++); + if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) + VEX_B = 0x0; + break; + case X86II::MRMDestReg: + // MRMDestReg instructions forms: + // dst(ModR/M), src(ModR/M) + // dst(ModR/M), src(ModR/M), imm8 + if (X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg())) + VEX_B = 0x0; + if (X86II::isX86_64ExtendedReg(MI.getOperand(1).getReg())) + VEX_R = 0x0; + break; + case X86II::MRM0r: case X86II::MRM1r: + case X86II::MRM2r: case X86II::MRM3r: + case X86II::MRM4r: case X86II::MRM5r: + case X86II::MRM6r: case X86II::MRM7r: + // MRM0r-MRM7r instructions forms: + // dst(VEX_4V), src(ModR/M), imm8 + VEX_4V = getVEXRegisterEncoding(MI, 0); + if (X86II::isX86_64ExtendedReg(MI.getOperand(1).getReg())) + VEX_B = 0x0; + break; + default: // RawFrm + break; + } + + // Emit segment override opcode prefix as needed. 
+ EmitSegmentOverridePrefix(TSFlags, CurByte, MemOperand, MI, OS); + + // VEX opcode prefix can have 2 or 3 bytes + // + // 3 bytes: + // +-----+ +--------------+ +-------------------+ + // | C4h | | RXB | m-mmmm | | W | vvvv | L | pp | + // +-----+ +--------------+ +-------------------+ + // 2 bytes: + // +-----+ +-------------------+ + // | C5h | | R | vvvv | L | pp | + // +-----+ +-------------------+ + // + unsigned char LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3); + + if (VEX_B && VEX_X && !VEX_W && (VEX_5M == 1)) { // 2 byte VEX prefix + EmitByte(0xC5, CurByte, OS); + EmitByte(LastByte | (VEX_R << 7), CurByte, OS); + return; + } + + // 3 byte VEX prefix + EmitByte(0xC4, CurByte, OS); + EmitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M, CurByte, OS); + EmitByte(LastByte | (VEX_W << 7), CurByte, OS); +} + +/// DetermineREXPrefix - Determine if the MCInst has to be encoded with a X86-64 +/// REX prefix which specifies 1) 64-bit instructions, 2) non-default operand +/// size, and 3) use of X86-64 extended registers. +static unsigned DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags, + const MCInstrDesc &Desc) { + unsigned REX = 0; + if (TSFlags & X86II::REX_W) + REX |= 1 << 3; // set REX.W + + if (MI.getNumOperands() == 0) return REX; + + unsigned NumOps = MI.getNumOperands(); + // FIXME: MCInst should explicitize the two-addrness. + bool isTwoAddr = NumOps > 1 && + Desc.getOperandConstraint(1, MCOI::TIED_TO) != -1; + + // If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix. + unsigned i = isTwoAddr ? 1 : 0; + for (; i != NumOps; ++i) { + const MCOperand &MO = MI.getOperand(i); + if (!MO.isReg()) continue; + unsigned Reg = MO.getReg(); + if (!X86II::isX86_64NonExtLowByteReg(Reg)) continue; + // FIXME: The caller of DetermineREXPrefix slaps this prefix onto anything + // that returns non-zero. + REX |= 0x40; // REX fixed encoding prefix + break; + } + + switch (TSFlags & X86II::FormMask) { + case X86II::MRMInitReg: assert(0 && "FIXME: Remove this!"); + case X86II::MRMSrcReg: + if (MI.getOperand(0).isReg() && + X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg())) + REX |= 1 << 2; // set REX.R + i = isTwoAddr ? 2 : 1; + for (; i != NumOps; ++i) { + const MCOperand &MO = MI.getOperand(i); + if (MO.isReg() && X86II::isX86_64ExtendedReg(MO.getReg())) + REX |= 1 << 0; // set REX.B + } + break; + case X86II::MRMSrcMem: { + if (MI.getOperand(0).isReg() && + X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg())) + REX |= 1 << 2; // set REX.R + unsigned Bit = 0; + i = isTwoAddr ? 2 : 1; + for (; i != NumOps; ++i) { + const MCOperand &MO = MI.getOperand(i); + if (MO.isReg()) { + if (X86II::isX86_64ExtendedReg(MO.getReg())) + REX |= 1 << Bit; // set REX.B (Bit=0) and REX.X (Bit=1) + Bit++; + } + } + break; + } + case X86II::MRM0m: case X86II::MRM1m: + case X86II::MRM2m: case X86II::MRM3m: + case X86II::MRM4m: case X86II::MRM5m: + case X86II::MRM6m: case X86II::MRM7m: + case X86II::MRMDestMem: { + unsigned e = (isTwoAddr ? X86::AddrNumOperands+1 : X86::AddrNumOperands); + i = isTwoAddr ? 
1 : 0; + if (NumOps > e && MI.getOperand(e).isReg() && + X86II::isX86_64ExtendedReg(MI.getOperand(e).getReg())) + REX |= 1 << 2; // set REX.R + unsigned Bit = 0; + for (; i != e; ++i) { + const MCOperand &MO = MI.getOperand(i); + if (MO.isReg()) { + if (X86II::isX86_64ExtendedReg(MO.getReg())) + REX |= 1 << Bit; // REX.B (Bit=0) and REX.X (Bit=1) + Bit++; + } + } + break; + } + default: + if (MI.getOperand(0).isReg() && + X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg())) + REX |= 1 << 0; // set REX.B + i = isTwoAddr ? 2 : 1; + for (unsigned e = NumOps; i != e; ++i) { + const MCOperand &MO = MI.getOperand(i); + if (MO.isReg() && X86II::isX86_64ExtendedReg(MO.getReg())) + REX |= 1 << 2; // set REX.R + } + break; + } + return REX; +} + +/// EmitSegmentOverridePrefix - Emit segment override opcode prefix as needed +void X86MCCodeEmitter::EmitSegmentOverridePrefix(uint64_t TSFlags, + unsigned &CurByte, int MemOperand, + const MCInst &MI, + raw_ostream &OS) const { + switch (TSFlags & X86II::SegOvrMask) { + default: assert(0 && "Invalid segment!"); + case 0: + // No segment override, check for explicit one on memory operand. + if (MemOperand != -1) { // If the instruction has a memory operand. + switch (MI.getOperand(MemOperand+X86::AddrSegmentReg).getReg()) { + default: assert(0 && "Unknown segment register!"); + case 0: break; + case X86::CS: EmitByte(0x2E, CurByte, OS); break; + case X86::SS: EmitByte(0x36, CurByte, OS); break; + case X86::DS: EmitByte(0x3E, CurByte, OS); break; + case X86::ES: EmitByte(0x26, CurByte, OS); break; + case X86::FS: EmitByte(0x64, CurByte, OS); break; + case X86::GS: EmitByte(0x65, CurByte, OS); break; + } + } + break; + case X86II::FS: + EmitByte(0x64, CurByte, OS); + break; + case X86II::GS: + EmitByte(0x65, CurByte, OS); + break; + } +} + +/// EmitOpcodePrefix - Emit all instruction prefixes prior to the opcode. +/// +/// MemOperand is the operand # of the start of a memory operand if present. If +/// Not present, it is -1. +void X86MCCodeEmitter::EmitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, + int MemOperand, const MCInst &MI, + const MCInstrDesc &Desc, + raw_ostream &OS) const { + + // Emit the lock opcode prefix as needed. + if (TSFlags & X86II::LOCK) + EmitByte(0xF0, CurByte, OS); + + // Emit segment override opcode prefix as needed. + EmitSegmentOverridePrefix(TSFlags, CurByte, MemOperand, MI, OS); + + // Emit the repeat opcode prefix as needed. + if ((TSFlags & X86II::Op0Mask) == X86II::REP) + EmitByte(0xF3, CurByte, OS); + + // Emit the address size opcode prefix as needed. + if ((TSFlags & X86II::AdSize) || + (MemOperand != -1 && is64BitMode() && Is32BitMemOperand(MI, MemOperand))) + EmitByte(0x67, CurByte, OS); + + // Emit the operand size opcode prefix as needed. + if (TSFlags & X86II::OpSize) + EmitByte(0x66, CurByte, OS); + + bool Need0FPrefix = false; + switch (TSFlags & X86II::Op0Mask) { + default: assert(0 && "Invalid prefix!"); + case 0: break; // No prefix! + case X86II::REP: break; // already handled. 
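// Aside: an illustration (standard x86-64 encoding, not code from the diff) of
// the REX byte DetermineREXPrefix assembles: 0x40 | W<<3 | R<<2 | X<<1 | B.
// "addq %r8, %rax" needs W (64-bit operation) and R (the ModRM reg field holds
// %r8, an extended register):
static_assert((0x40 | (1 << 3) | (1 << 2)) == 0x4C, "REX.W + REX.R");
// giving 4C 01 C0 for the complete instruction.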
+ case X86II::TB: // Two-byte opcode prefix + case X86II::T8: // 0F 38 + case X86II::TA: // 0F 3A + case X86II::A6: // 0F A6 + case X86II::A7: // 0F A7 + Need0FPrefix = true; + break; + case X86II::TF: // F2 0F 38 + EmitByte(0xF2, CurByte, OS); + Need0FPrefix = true; + break; + case X86II::XS: // F3 0F + EmitByte(0xF3, CurByte, OS); + Need0FPrefix = true; + break; + case X86II::XD: // F2 0F + EmitByte(0xF2, CurByte, OS); + Need0FPrefix = true; + break; + case X86II::D8: EmitByte(0xD8, CurByte, OS); break; + case X86II::D9: EmitByte(0xD9, CurByte, OS); break; + case X86II::DA: EmitByte(0xDA, CurByte, OS); break; + case X86II::DB: EmitByte(0xDB, CurByte, OS); break; + case X86II::DC: EmitByte(0xDC, CurByte, OS); break; + case X86II::DD: EmitByte(0xDD, CurByte, OS); break; + case X86II::DE: EmitByte(0xDE, CurByte, OS); break; + case X86II::DF: EmitByte(0xDF, CurByte, OS); break; + } + + // Handle REX prefix. + // FIXME: Can this come before F2 etc to simplify emission? + if (is64BitMode()) { + if (unsigned REX = DetermineREXPrefix(MI, TSFlags, Desc)) + EmitByte(0x40 | REX, CurByte, OS); + } + + // 0x0F escape code must be emitted just before the opcode. + if (Need0FPrefix) + EmitByte(0x0F, CurByte, OS); + + // FIXME: Pull this up into previous switch if REX can be moved earlier. + switch (TSFlags & X86II::Op0Mask) { + case X86II::TF: // F2 0F 38 + case X86II::T8: // 0F 38 + EmitByte(0x38, CurByte, OS); + break; + case X86II::TA: // 0F 3A + EmitByte(0x3A, CurByte, OS); + break; + case X86II::A6: // 0F A6 + EmitByte(0xA6, CurByte, OS); + break; + case X86II::A7: // 0F A7 + EmitByte(0xA7, CurByte, OS); + break; + } +} + +void X86MCCodeEmitter:: +EncodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups) const { + unsigned Opcode = MI.getOpcode(); + const MCInstrDesc &Desc = MCII.get(Opcode); + uint64_t TSFlags = Desc.TSFlags; + + // Pseudo instructions don't get encoded. + if ((TSFlags & X86II::FormMask) == X86II::Pseudo) + return; + + // If this is a two-address instruction, skip one of the register operands. + // FIXME: This should be handled during MCInst lowering. + unsigned NumOps = Desc.getNumOperands(); + unsigned CurOp = 0; + if (NumOps > 1 && Desc.getOperandConstraint(1, MCOI::TIED_TO) != -1) + ++CurOp; + else if (NumOps > 2 && Desc.getOperandConstraint(NumOps-1, MCOI::TIED_TO)== 0) + // Skip the last source operand that is tied_to the dest reg. e.g. LXADD32 + --NumOps; + + // Keep track of the current byte being emitted. + unsigned CurByte = 0; + + // Is this instruction encoded using the AVX VEX prefix? + bool HasVEXPrefix = false; + + // It uses the VEX.VVVV field? + bool HasVEX_4V = false; + + if ((TSFlags >> X86II::VEXShift) & X86II::VEX) + HasVEXPrefix = true; + if ((TSFlags >> X86II::VEXShift) & X86II::VEX_4V) + HasVEX_4V = true; + + // Determine where the memory operand starts, if present. + int MemoryOperand = X86II::getMemoryOperandNo(TSFlags); + if (MemoryOperand != -1) MemoryOperand += CurOp; + + if (!HasVEXPrefix) + EmitOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, OS); + else + EmitVEXOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, OS); + + unsigned char BaseOpcode = X86II::getBaseOpcodeFor(TSFlags); + + if ((TSFlags >> X86II::VEXShift) & X86II::Has3DNow0F0FOpcode) + BaseOpcode = 0x0F; // Weird 3DNow! encoding. 
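// Aside: an end-to-end illustration (standard IA-32 encoding, not from the
// diff) of the byte order the emitter produces: prefixes, escape bytes,
// opcode, ModRM, optional SIB, displacement, immediate. For
// "addl $1, 8(%eax,%ecx,4)" in 32-bit mode no prefixes are needed, so the
// bytes are just 83 44 88 08 01: opcode 0x83 (the /0 ib group), ModRM 0x44
// (mod=1, /0, r/m=4 selects a SIB), SIB 0x88 (scale 4, index %ecx, base %eax),
// disp8 0x08, imm8 0x01.
static_assert((4 | (0 << 3) | (1 << 6)) == 0x44, "ModRM for the example");
static_assert((0 | (1 << 3) | (2 << 6)) == 0x88, "SIB for the example");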
+ + unsigned SrcRegNum = 0; + switch (TSFlags & X86II::FormMask) { + case X86II::MRMInitReg: + assert(0 && "FIXME: Remove this form when the JIT moves to MCCodeEmitter!"); + default: errs() << "FORM: " << (TSFlags & X86II::FormMask) << "\n"; + assert(0 && "Unknown FormMask value in X86MCCodeEmitter!"); + case X86II::Pseudo: + assert(0 && "Pseudo instruction shouldn't be emitted"); + case X86II::RawFrm: + EmitByte(BaseOpcode, CurByte, OS); + break; + case X86II::RawFrmImm8: + EmitByte(BaseOpcode, CurByte, OS); + EmitImmediate(MI.getOperand(CurOp++), + X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags), + CurByte, OS, Fixups); + EmitImmediate(MI.getOperand(CurOp++), 1, FK_Data_1, CurByte, OS, Fixups); + break; + case X86II::RawFrmImm16: + EmitByte(BaseOpcode, CurByte, OS); + EmitImmediate(MI.getOperand(CurOp++), + X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags), + CurByte, OS, Fixups); + EmitImmediate(MI.getOperand(CurOp++), 2, FK_Data_2, CurByte, OS, Fixups); + break; + + case X86II::AddRegFrm: + EmitByte(BaseOpcode + GetX86RegNum(MI.getOperand(CurOp++)), CurByte, OS); + break; + + case X86II::MRMDestReg: + EmitByte(BaseOpcode, CurByte, OS); + EmitRegModRMByte(MI.getOperand(CurOp), + GetX86RegNum(MI.getOperand(CurOp+1)), CurByte, OS); + CurOp += 2; + break; + + case X86II::MRMDestMem: + EmitByte(BaseOpcode, CurByte, OS); + SrcRegNum = CurOp + X86::AddrNumOperands; + + if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV) + SrcRegNum++; + + EmitMemModRMByte(MI, CurOp, + GetX86RegNum(MI.getOperand(SrcRegNum)), + TSFlags, CurByte, OS, Fixups); + CurOp = SrcRegNum + 1; + break; + + case X86II::MRMSrcReg: + EmitByte(BaseOpcode, CurByte, OS); + SrcRegNum = CurOp + 1; + + if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV) + SrcRegNum++; + + EmitRegModRMByte(MI.getOperand(SrcRegNum), + GetX86RegNum(MI.getOperand(CurOp)), CurByte, OS); + CurOp = SrcRegNum + 1; + break; + + case X86II::MRMSrcMem: { + int AddrOperands = X86::AddrNumOperands; + unsigned FirstMemOp = CurOp+1; + if (HasVEX_4V) { + ++AddrOperands; + ++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV). + } + + EmitByte(BaseOpcode, CurByte, OS); + + EmitMemModRMByte(MI, FirstMemOp, GetX86RegNum(MI.getOperand(CurOp)), + TSFlags, CurByte, OS, Fixups); + CurOp += AddrOperands + 1; + break; + } + + case X86II::MRM0r: case X86II::MRM1r: + case X86II::MRM2r: case X86II::MRM3r: + case X86II::MRM4r: case X86II::MRM5r: + case X86II::MRM6r: case X86II::MRM7r: + if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV). 
+ CurOp++; + EmitByte(BaseOpcode, CurByte, OS); + EmitRegModRMByte(MI.getOperand(CurOp++), + (TSFlags & X86II::FormMask)-X86II::MRM0r, + CurByte, OS); + break; + case X86II::MRM0m: case X86II::MRM1m: + case X86II::MRM2m: case X86II::MRM3m: + case X86II::MRM4m: case X86II::MRM5m: + case X86II::MRM6m: case X86II::MRM7m: + EmitByte(BaseOpcode, CurByte, OS); + EmitMemModRMByte(MI, CurOp, (TSFlags & X86II::FormMask)-X86II::MRM0m, + TSFlags, CurByte, OS, Fixups); + CurOp += X86::AddrNumOperands; + break; + case X86II::MRM_C1: + EmitByte(BaseOpcode, CurByte, OS); + EmitByte(0xC1, CurByte, OS); + break; + case X86II::MRM_C2: + EmitByte(BaseOpcode, CurByte, OS); + EmitByte(0xC2, CurByte, OS); + break; + case X86II::MRM_C3: + EmitByte(BaseOpcode, CurByte, OS); + EmitByte(0xC3, CurByte, OS); + break; + case X86II::MRM_C4: + EmitByte(BaseOpcode, CurByte, OS); + EmitByte(0xC4, CurByte, OS); + break; + case X86II::MRM_C8: + EmitByte(BaseOpcode, CurByte, OS); + EmitByte(0xC8, CurByte, OS); + break; + case X86II::MRM_C9: + EmitByte(BaseOpcode, CurByte, OS); + EmitByte(0xC9, CurByte, OS); + break; + case X86II::MRM_E8: + EmitByte(BaseOpcode, CurByte, OS); + EmitByte(0xE8, CurByte, OS); + break; + case X86II::MRM_F0: + EmitByte(BaseOpcode, CurByte, OS); + EmitByte(0xF0, CurByte, OS); + break; + case X86II::MRM_F8: + EmitByte(BaseOpcode, CurByte, OS); + EmitByte(0xF8, CurByte, OS); + break; + case X86II::MRM_F9: + EmitByte(BaseOpcode, CurByte, OS); + EmitByte(0xF9, CurByte, OS); + break; + case X86II::MRM_D0: + EmitByte(BaseOpcode, CurByte, OS); + EmitByte(0xD0, CurByte, OS); + break; + case X86II::MRM_D1: + EmitByte(BaseOpcode, CurByte, OS); + EmitByte(0xD1, CurByte, OS); + break; + } + + // If there is a remaining operand, it must be a trailing immediate. Emit it + // according to the right size for the instruction. + if (CurOp != NumOps) { + // The last source register of a 4 operand instruction in AVX is encoded + // in bits[7:4] of a immediate byte, and bits[3:0] are ignored. + if ((TSFlags >> X86II::VEXShift) & X86II::VEX_I8IMM) { + const MCOperand &MO = MI.getOperand(CurOp++); + bool IsExtReg = X86II::isX86_64ExtendedReg(MO.getReg()); + unsigned RegNum = (IsExtReg ? (1 << 7) : 0); + RegNum |= GetX86RegNum(MO) << 4; + EmitImmediate(MCOperand::CreateImm(RegNum), 1, FK_Data_1, CurByte, OS, + Fixups); + } else { + unsigned FixupKind; + // FIXME: Is there a better way to know that we need a signed relocation? + if (MI.getOpcode() == X86::ADD64ri32 || + MI.getOpcode() == X86::MOV64ri32 || + MI.getOpcode() == X86::MOV64mi32 || + MI.getOpcode() == X86::PUSH64i32) + FixupKind = X86::reloc_signed_4byte; + else + FixupKind = getImmFixupKind(TSFlags); + EmitImmediate(MI.getOperand(CurOp++), + X86II::getSizeOfImm(TSFlags), MCFixupKind(FixupKind), + CurByte, OS, Fixups); + } + } + + if ((TSFlags >> X86II::VEXShift) & X86II::Has3DNow0F0FOpcode) + EmitByte(X86II::getBaseOpcodeFor(TSFlags), CurByte, OS); + +#ifndef NDEBUG + // FIXME: Verify. 
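+  // Every operand should have been consumed by one of the cases above; any
+  // leftover operand means the encoder does not know how to emit this
+  // instruction.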
+ if (/*!Desc.isVariadic() &&*/ CurOp != NumOps) { + errs() << "Cannot encode all operands of: "; + MI.dump(); + errs() << '\n'; + abort(); + } +#endif +} diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp new file mode 100644 index 0000000..f98d5e3 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -0,0 +1,463 @@ +//===-- X86MCTargetDesc.cpp - X86 Target Descriptions -----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides X86 specific target descriptions. +// +//===----------------------------------------------------------------------===// + +#include "X86MCTargetDesc.h" +#include "X86MCAsmInfo.h" +#include "InstPrinter/X86ATTInstPrinter.h" +#include "InstPrinter/X86IntelInstPrinter.h" +#include "llvm/MC/MachineLocation.h" +#include "llvm/MC/MCCodeGenInfo.h" +#include "llvm/MC/MCInstrAnalysis.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/ADT/Triple.h" +#include "llvm/Support/Host.h" +#include "llvm/Support/TargetRegistry.h" + +#define GET_REGINFO_MC_DESC +#include "X86GenRegisterInfo.inc" + +#define GET_INSTRINFO_MC_DESC +#include "X86GenInstrInfo.inc" + +#define GET_SUBTARGETINFO_MC_DESC +#include "X86GenSubtargetInfo.inc" + +using namespace llvm; + + +std::string X86_MC::ParseX86Triple(StringRef TT) { + Triple TheTriple(TT); + std::string FS; + if (TheTriple.getArch() == Triple::x86_64) + FS = "+64bit-mode"; + else + FS = "-64bit-mode"; + if (TheTriple.getOS() == Triple::NativeClient) + FS += ",+nacl-mode"; + else + FS += ",-nacl-mode"; + return FS; +} + +/// GetCpuIDAndInfo - Execute the specified cpuid and return the 4 values in the +/// specified arguments. If we can't run cpuid on the host, return true. +bool X86_MC::GetCpuIDAndInfo(unsigned value, unsigned *rEAX, + unsigned *rEBX, unsigned *rECX, unsigned *rEDX) { +#if defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64) + #if defined(__GNUC__) + // gcc doesn't know cpuid would clobber ebx/rbx. Preseve it manually. 
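+    // %rbx is stashed in %rsi before the cpuid and swapped back afterwards,
+    // so the EBX result is returned through %rsi while %rbx itself is left
+    // untouched for the compiler.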
+ asm ("movq\t%%rbx, %%rsi\n\t" + "cpuid\n\t" + "xchgq\t%%rbx, %%rsi\n\t" + : "=a" (*rEAX), + "=S" (*rEBX), + "=c" (*rECX), + "=d" (*rEDX) + : "a" (value)); + return false; + #elif defined(_MSC_VER) + int registers[4]; + __cpuid(registers, value); + *rEAX = registers[0]; + *rEBX = registers[1]; + *rECX = registers[2]; + *rEDX = registers[3]; + return false; + #endif +#elif defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86) + #if defined(__GNUC__) + asm ("movl\t%%ebx, %%esi\n\t" + "cpuid\n\t" + "xchgl\t%%ebx, %%esi\n\t" + : "=a" (*rEAX), + "=S" (*rEBX), + "=c" (*rECX), + "=d" (*rEDX) + : "a" (value)); + return false; + #elif defined(_MSC_VER) + __asm { + mov eax,value + cpuid + mov esi,rEAX + mov dword ptr [esi],eax + mov esi,rEBX + mov dword ptr [esi],ebx + mov esi,rECX + mov dword ptr [esi],ecx + mov esi,rEDX + mov dword ptr [esi],edx + } + return false; + #endif +#endif + return true; +} + +void X86_MC::DetectFamilyModel(unsigned EAX, unsigned &Family, + unsigned &Model) { + Family = (EAX >> 8) & 0xf; // Bits 8 - 11 + Model = (EAX >> 4) & 0xf; // Bits 4 - 7 + if (Family == 6 || Family == 0xf) { + if (Family == 0xf) + // Examine extended family ID if family ID is F. + Family += (EAX >> 20) & 0xff; // Bits 20 - 27 + // Examine extended model ID if family ID is 6 or F. + Model += ((EAX >> 16) & 0xf) << 4; // Bits 16 - 19 + } +} + +unsigned X86_MC::getDwarfRegFlavour(StringRef TT, bool isEH) { + Triple TheTriple(TT); + if (TheTriple.getArch() == Triple::x86_64) + return DWARFFlavour::X86_64; + + if (TheTriple.isOSDarwin()) + return isEH ? DWARFFlavour::X86_32_DarwinEH : DWARFFlavour::X86_32_Generic; + if (TheTriple.getOS() == Triple::MinGW32 || + TheTriple.getOS() == Triple::Cygwin) + // Unsupported by now, just quick fallback + return DWARFFlavour::X86_32_Generic; + return DWARFFlavour::X86_32_Generic; +} + +/// getX86RegNum - This function maps LLVM register identifiers to their X86 +/// specific numbering, which is used in various places encoding instructions. 
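+/// For example, %rcx, %ecx, %cx and %cl all map to N86::ECX (1), as do the
+/// extended registers %r9/%r9d/%r9w/%r9b; the REX prefix bits are what
+/// distinguish the two banks.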
+unsigned X86_MC::getX86RegNum(unsigned RegNo) { + switch(RegNo) { + case X86::RAX: case X86::EAX: case X86::AX: case X86::AL: return N86::EAX; + case X86::RCX: case X86::ECX: case X86::CX: case X86::CL: return N86::ECX; + case X86::RDX: case X86::EDX: case X86::DX: case X86::DL: return N86::EDX; + case X86::RBX: case X86::EBX: case X86::BX: case X86::BL: return N86::EBX; + case X86::RSP: case X86::ESP: case X86::SP: case X86::SPL: case X86::AH: + return N86::ESP; + case X86::RBP: case X86::EBP: case X86::BP: case X86::BPL: case X86::CH: + return N86::EBP; + case X86::RSI: case X86::ESI: case X86::SI: case X86::SIL: case X86::DH: + return N86::ESI; + case X86::RDI: case X86::EDI: case X86::DI: case X86::DIL: case X86::BH: + return N86::EDI; + + case X86::R8: case X86::R8D: case X86::R8W: case X86::R8B: + return N86::EAX; + case X86::R9: case X86::R9D: case X86::R9W: case X86::R9B: + return N86::ECX; + case X86::R10: case X86::R10D: case X86::R10W: case X86::R10B: + return N86::EDX; + case X86::R11: case X86::R11D: case X86::R11W: case X86::R11B: + return N86::EBX; + case X86::R12: case X86::R12D: case X86::R12W: case X86::R12B: + return N86::ESP; + case X86::R13: case X86::R13D: case X86::R13W: case X86::R13B: + return N86::EBP; + case X86::R14: case X86::R14D: case X86::R14W: case X86::R14B: + return N86::ESI; + case X86::R15: case X86::R15D: case X86::R15W: case X86::R15B: + return N86::EDI; + + case X86::ST0: case X86::ST1: case X86::ST2: case X86::ST3: + case X86::ST4: case X86::ST5: case X86::ST6: case X86::ST7: + return RegNo-X86::ST0; + + case X86::XMM0: case X86::XMM8: + case X86::YMM0: case X86::YMM8: case X86::MM0: + return 0; + case X86::XMM1: case X86::XMM9: + case X86::YMM1: case X86::YMM9: case X86::MM1: + return 1; + case X86::XMM2: case X86::XMM10: + case X86::YMM2: case X86::YMM10: case X86::MM2: + return 2; + case X86::XMM3: case X86::XMM11: + case X86::YMM3: case X86::YMM11: case X86::MM3: + return 3; + case X86::XMM4: case X86::XMM12: + case X86::YMM4: case X86::YMM12: case X86::MM4: + return 4; + case X86::XMM5: case X86::XMM13: + case X86::YMM5: case X86::YMM13: case X86::MM5: + return 5; + case X86::XMM6: case X86::XMM14: + case X86::YMM6: case X86::YMM14: case X86::MM6: + return 6; + case X86::XMM7: case X86::XMM15: + case X86::YMM7: case X86::YMM15: case X86::MM7: + return 7; + + case X86::ES: return 0; + case X86::CS: return 1; + case X86::SS: return 2; + case X86::DS: return 3; + case X86::FS: return 4; + case X86::GS: return 5; + + case X86::CR0: case X86::CR8 : case X86::DR0: return 0; + case X86::CR1: case X86::CR9 : case X86::DR1: return 1; + case X86::CR2: case X86::CR10: case X86::DR2: return 2; + case X86::CR3: case X86::CR11: case X86::DR3: return 3; + case X86::CR4: case X86::CR12: case X86::DR4: return 4; + case X86::CR5: case X86::CR13: case X86::DR5: return 5; + case X86::CR6: case X86::CR14: case X86::DR6: return 6; + case X86::CR7: case X86::CR15: case X86::DR7: return 7; + + // Pseudo index registers are equivalent to a "none" + // scaled index (See Intel Manual 2A, table 2-3) + case X86::EIZ: + case X86::RIZ: + return 4; + + default: + assert((int(RegNo) > 0) && "Unknown physical register!"); + return 0; + } +} + +void X86_MC::InitLLVM2SEHRegisterMapping(MCRegisterInfo *MRI) { + // FIXME: TableGen these. 
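+  // Walk every physical register and record its Win64 SEH number. The base
+  // number comes from getX86RegNum (0-7); the extended registers listed in
+  // the switch below occupy the upper half of each bank, hence the +8.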
+ for (unsigned Reg = X86::NoRegister+1; Reg < X86::NUM_TARGET_REGS; ++Reg) { + int SEH = X86_MC::getX86RegNum(Reg); + switch (Reg) { + case X86::R8: case X86::R8D: case X86::R8W: case X86::R8B: + case X86::R9: case X86::R9D: case X86::R9W: case X86::R9B: + case X86::R10: case X86::R10D: case X86::R10W: case X86::R10B: + case X86::R11: case X86::R11D: case X86::R11W: case X86::R11B: + case X86::R12: case X86::R12D: case X86::R12W: case X86::R12B: + case X86::R13: case X86::R13D: case X86::R13W: case X86::R13B: + case X86::R14: case X86::R14D: case X86::R14W: case X86::R14B: + case X86::R15: case X86::R15D: case X86::R15W: case X86::R15B: + case X86::XMM8: case X86::XMM9: case X86::XMM10: case X86::XMM11: + case X86::XMM12: case X86::XMM13: case X86::XMM14: case X86::XMM15: + case X86::YMM8: case X86::YMM9: case X86::YMM10: case X86::YMM11: + case X86::YMM12: case X86::YMM13: case X86::YMM14: case X86::YMM15: + SEH += 8; + break; + } + MRI->mapLLVMRegToSEHReg(Reg, SEH); + } +} + +MCSubtargetInfo *X86_MC::createX86MCSubtargetInfo(StringRef TT, StringRef CPU, + StringRef FS) { + std::string ArchFS = X86_MC::ParseX86Triple(TT); + if (!FS.empty()) { + if (!ArchFS.empty()) + ArchFS = ArchFS + "," + FS.str(); + else + ArchFS = FS; + } + + std::string CPUName = CPU; + if (CPUName.empty()) { +#if defined (__x86_64__) || defined(__i386__) + CPUName = sys::getHostCPUName(); +#else + CPUName = "generic"; +#endif + } + + MCSubtargetInfo *X = new MCSubtargetInfo(); + InitX86MCSubtargetInfo(X, TT, CPUName, ArchFS); + return X; +} + +static MCInstrInfo *createX86MCInstrInfo() { + MCInstrInfo *X = new MCInstrInfo(); + InitX86MCInstrInfo(X); + return X; +} + +static MCRegisterInfo *createX86MCRegisterInfo(StringRef TT) { + Triple TheTriple(TT); + unsigned RA = (TheTriple.getArch() == Triple::x86_64) + ? X86::RIP // Should have dwarf #16. + : X86::EIP; // Should have dwarf #8. + + MCRegisterInfo *X = new MCRegisterInfo(); + InitX86MCRegisterInfo(X, RA, + X86_MC::getDwarfRegFlavour(TT, false), + X86_MC::getDwarfRegFlavour(TT, true)); + X86_MC::InitLLVM2SEHRegisterMapping(X); + return X; +} + +static MCAsmInfo *createX86MCAsmInfo(const Target &T, StringRef TT) { + Triple TheTriple(TT); + bool is64Bit = TheTriple.getArch() == Triple::x86_64; + + MCAsmInfo *MAI; + if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO) { + if (is64Bit) + MAI = new X86_64MCAsmInfoDarwin(TheTriple); + else + MAI = new X86MCAsmInfoDarwin(TheTriple); + } else if (TheTriple.isOSWindows()) { + MAI = new X86MCAsmInfoCOFF(TheTriple); + } else { + MAI = new X86ELFMCAsmInfo(TheTriple); + } + + // Initialize initial frame state. + // Calculate amount of bytes used for return address storing + int stackGrowth = is64Bit ? -8 : -4; + + // Initial state of the frame pointer is esp+stackGrowth. + MachineLocation Dst(MachineLocation::VirtualFP); + MachineLocation Src(is64Bit ? X86::RSP : X86::ESP, stackGrowth); + MAI->addInitialFrameState(0, Dst, Src); + + // Add return address to move list + MachineLocation CSDst(is64Bit ? X86::RSP : X86::ESP, stackGrowth); + MachineLocation CSSrc(is64Bit ? X86::RIP : X86::EIP); + MAI->addInitialFrameState(0, CSDst, CSSrc); + + return MAI; +} + +static MCCodeGenInfo *createX86MCCodeGenInfo(StringRef TT, Reloc::Model RM, + CodeModel::Model CM) { + MCCodeGenInfo *X = new MCCodeGenInfo(); + + Triple T(TT); + bool is64Bit = T.getArch() == Triple::x86_64; + + if (RM == Reloc::Default) { + // Darwin defaults to PIC in 64 bit mode and dynamic-no-pic in 32 bit mode. 
+ // Win64 requires rip-rel addressing, thus we force it to PIC. Otherwise we + // use static relocation model by default. + if (T.isOSDarwin()) { + if (is64Bit) + RM = Reloc::PIC_; + else + RM = Reloc::DynamicNoPIC; + } else if (T.isOSWindows() && is64Bit) + RM = Reloc::PIC_; + else + RM = Reloc::Static; + } + + // ELF and X86-64 don't have a distinct DynamicNoPIC model. DynamicNoPIC + // is defined as a model for code which may be used in static or dynamic + // executables but not necessarily a shared library. On X86-32 we just + // compile in -static mode, in x86-64 we use PIC. + if (RM == Reloc::DynamicNoPIC) { + if (is64Bit) + RM = Reloc::PIC_; + else if (!T.isOSDarwin()) + RM = Reloc::Static; + } + + // If we are on Darwin, disallow static relocation model in X86-64 mode, since + // the Mach-O file format doesn't support it. + if (RM == Reloc::Static && T.isOSDarwin() && is64Bit) + RM = Reloc::PIC_; + + // For static codegen, if we're not already set, use Small codegen. + if (CM == CodeModel::Default) + CM = CodeModel::Small; + else if (CM == CodeModel::JITDefault) + // 64-bit JIT places everything in the same buffer except external funcs. + CM = is64Bit ? CodeModel::Large : CodeModel::Small; + + X->InitMCCodeGenInfo(RM, CM); + return X; +} + +static MCStreamer *createMCStreamer(const Target &T, StringRef TT, + MCContext &Ctx, MCAsmBackend &MAB, + raw_ostream &_OS, + MCCodeEmitter *_Emitter, + bool RelaxAll, + bool NoExecStack) { + Triple TheTriple(TT); + + if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO) + return createMachOStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll); + + if (TheTriple.isOSWindows()) + return createWinCOFFStreamer(Ctx, MAB, *_Emitter, _OS, RelaxAll); + + return createELFStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll, NoExecStack); +} + +static MCInstPrinter *createX86MCInstPrinter(const Target &T, + unsigned SyntaxVariant, + const MCAsmInfo &MAI, + const MCSubtargetInfo &STI) { + if (SyntaxVariant == 0) + return new X86ATTInstPrinter(MAI); + if (SyntaxVariant == 1) + return new X86IntelInstPrinter(MAI); + return 0; +} + +static MCInstrAnalysis *createX86MCInstrAnalysis(const MCInstrInfo *Info) { + return new MCInstrAnalysis(Info); +} + +// Force static initialization. +extern "C" void LLVMInitializeX86TargetMC() { + // Register the MC asm info. + RegisterMCAsmInfoFn A(TheX86_32Target, createX86MCAsmInfo); + RegisterMCAsmInfoFn B(TheX86_64Target, createX86MCAsmInfo); + + // Register the MC codegen info. + RegisterMCCodeGenInfoFn C(TheX86_32Target, createX86MCCodeGenInfo); + RegisterMCCodeGenInfoFn D(TheX86_64Target, createX86MCCodeGenInfo); + + // Register the MC instruction info. + TargetRegistry::RegisterMCInstrInfo(TheX86_32Target, createX86MCInstrInfo); + TargetRegistry::RegisterMCInstrInfo(TheX86_64Target, createX86MCInstrInfo); + + // Register the MC register info. + TargetRegistry::RegisterMCRegInfo(TheX86_32Target, createX86MCRegisterInfo); + TargetRegistry::RegisterMCRegInfo(TheX86_64Target, createX86MCRegisterInfo); + + // Register the MC subtarget info. + TargetRegistry::RegisterMCSubtargetInfo(TheX86_32Target, + X86_MC::createX86MCSubtargetInfo); + TargetRegistry::RegisterMCSubtargetInfo(TheX86_64Target, + X86_MC::createX86MCSubtargetInfo); + + // Register the MC instruction analyzer. + TargetRegistry::RegisterMCInstrAnalysis(TheX86_32Target, + createX86MCInstrAnalysis); + TargetRegistry::RegisterMCInstrAnalysis(TheX86_64Target, + createX86MCInstrAnalysis); + + // Register the code emitter. 
+ TargetRegistry::RegisterMCCodeEmitter(TheX86_32Target, + createX86MCCodeEmitter); + TargetRegistry::RegisterMCCodeEmitter(TheX86_64Target, + createX86MCCodeEmitter); + + // Register the asm backend. + TargetRegistry::RegisterMCAsmBackend(TheX86_32Target, + createX86_32AsmBackend); + TargetRegistry::RegisterMCAsmBackend(TheX86_64Target, + createX86_64AsmBackend); + + // Register the object streamer. + TargetRegistry::RegisterMCObjectStreamer(TheX86_32Target, + createMCStreamer); + TargetRegistry::RegisterMCObjectStreamer(TheX86_64Target, + createMCStreamer); + + // Register the MCInstPrinter. + TargetRegistry::RegisterMCInstPrinter(TheX86_32Target, + createX86MCInstPrinter); + TargetRegistry::RegisterMCInstPrinter(TheX86_64Target, + createX86MCInstPrinter); +} diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h new file mode 100644 index 0000000..c144c51 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h @@ -0,0 +1,103 @@ +//===-- X86MCTargetDesc.h - X86 Target Descriptions -------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides X86 specific target descriptions. +// +//===----------------------------------------------------------------------===// + +#ifndef X86MCTARGETDESC_H +#define X86MCTARGETDESC_H + +#include "llvm/Support/DataTypes.h" +#include <string> + +namespace llvm { +class MCAsmBackend; +class MCCodeEmitter; +class MCContext; +class MCInstrInfo; +class MCObjectWriter; +class MCRegisterInfo; +class MCSubtargetInfo; +class Target; +class StringRef; +class raw_ostream; + +extern Target TheX86_32Target, TheX86_64Target; + +/// DWARFFlavour - Flavour of dwarf regnumbers +/// +namespace DWARFFlavour { + enum { + X86_64 = 0, X86_32_DarwinEH = 1, X86_32_Generic = 2 + }; +} + +/// N86 namespace - Native X86 register numbers +/// +namespace N86 { + enum { + EAX = 0, ECX = 1, EDX = 2, EBX = 3, ESP = 4, EBP = 5, ESI = 6, EDI = 7 + }; +} + +namespace X86_MC { + std::string ParseX86Triple(StringRef TT); + + /// GetCpuIDAndInfo - Execute the specified cpuid and return the 4 values in + /// the specified arguments. If we can't run cpuid on the host, return true. + bool GetCpuIDAndInfo(unsigned value, unsigned *rEAX, + unsigned *rEBX, unsigned *rECX, unsigned *rEDX); + + void DetectFamilyModel(unsigned EAX, unsigned &Family, unsigned &Model); + + unsigned getDwarfRegFlavour(StringRef TT, bool isEH); + + unsigned getX86RegNum(unsigned RegNo); + + void InitLLVM2SEHRegisterMapping(MCRegisterInfo *MRI); + + /// createX86MCSubtargetInfo - Create a X86 MCSubtargetInfo instance. + /// This is exposed so Asm parser, etc. do not need to go through + /// TargetRegistry. + MCSubtargetInfo *createX86MCSubtargetInfo(StringRef TT, StringRef CPU, + StringRef FS); +} + +MCCodeEmitter *createX86MCCodeEmitter(const MCInstrInfo &MCII, + const MCSubtargetInfo &STI, + MCContext &Ctx); + +MCAsmBackend *createX86_32AsmBackend(const Target &T, StringRef TT); +MCAsmBackend *createX86_64AsmBackend(const Target &T, StringRef TT); + +/// createX86MachObjectWriter - Construct an X86 Mach-O object writer. 
+MCObjectWriter *createX86MachObjectWriter(raw_ostream &OS, + bool Is64Bit, + uint32_t CPUType, + uint32_t CPUSubtype); + +} // End llvm namespace + + +// Defines symbolic names for X86 registers. This defines a mapping from +// register name to register number. +// +#define GET_REGINFO_ENUM +#include "X86GenRegisterInfo.inc" + +// Defines symbolic names for the X86 instructions. +// +#define GET_INSTRINFO_ENUM +#include "X86GenInstrInfo.inc" + +#define GET_SUBTARGETINFO_ENUM +#include "X86GenSubtargetInfo.inc" + +#endif diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp new file mode 100644 index 0000000..f0f1982 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp @@ -0,0 +1,554 @@ +//===-- X86MachObjectWriter.cpp - X86 Mach-O Writer -----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/X86FixupKinds.h" +#include "MCTargetDesc/X86MCTargetDesc.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCAsmLayout.h" +#include "llvm/MC/MCMachObjectWriter.h" +#include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCValue.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Object/MachOFormat.h" + +using namespace llvm; +using namespace llvm::object; + +namespace { +class X86MachObjectWriter : public MCMachObjectTargetWriter { + void RecordScatteredRelocation(MachObjectWriter *Writer, + const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup, + MCValue Target, + unsigned Log2Size, + uint64_t &FixedValue); + void RecordTLVPRelocation(MachObjectWriter *Writer, + const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup, + MCValue Target, + uint64_t &FixedValue); + + void RecordX86Relocation(MachObjectWriter *Writer, + const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup, + MCValue Target, + uint64_t &FixedValue); + void RecordX86_64Relocation(MachObjectWriter *Writer, + const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup, + MCValue Target, + uint64_t &FixedValue); +public: + X86MachObjectWriter(bool Is64Bit, uint32_t CPUType, + uint32_t CPUSubtype) + : MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype, + /*UseAggressiveSymbolFolding=*/Is64Bit) {} + + void RecordRelocation(MachObjectWriter *Writer, + const MCAssembler &Asm, const MCAsmLayout &Layout, + const MCFragment *Fragment, const MCFixup &Fixup, + MCValue Target, uint64_t &FixedValue) { + if (Writer->is64Bit()) + RecordX86_64Relocation(Writer, Asm, Layout, Fragment, Fixup, Target, + FixedValue); + else + RecordX86Relocation(Writer, Asm, Layout, Fragment, Fixup, Target, + FixedValue); + } +}; +} + +static bool isFixupKindRIPRel(unsigned Kind) { + return Kind == X86::reloc_riprel_4byte || + Kind == X86::reloc_riprel_4byte_movq_load; +} + +static unsigned getFixupKindLog2Size(unsigned Kind) { + switch (Kind) { + default: + llvm_unreachable("invalid fixup kind!"); + case FK_PCRel_1: + case FK_Data_1: return 0; + case FK_PCRel_2: + case FK_Data_2: return 1; + case FK_PCRel_4: + // FIXME: Remove these!!! 
+ case X86::reloc_riprel_4byte: + case X86::reloc_riprel_4byte_movq_load: + case X86::reloc_signed_4byte: + case FK_Data_4: return 2; + case FK_Data_8: return 3; + } +} + +void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer, + const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup, + MCValue Target, + uint64_t &FixedValue) { + unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind()); + unsigned IsRIPRel = isFixupKindRIPRel(Fixup.getKind()); + unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind()); + + // See <reloc.h>. + uint32_t FixupOffset = + Layout.getFragmentOffset(Fragment) + Fixup.getOffset(); + uint32_t FixupAddress = + Writer->getFragmentAddress(Fragment, Layout) + Fixup.getOffset(); + int64_t Value = 0; + unsigned Index = 0; + unsigned IsExtern = 0; + unsigned Type = 0; + + Value = Target.getConstant(); + + if (IsPCRel) { + // Compensate for the relocation offset, Darwin x86_64 relocations only have + // the addend and appear to have attempted to define it to be the actual + // expression addend without the PCrel bias. However, instructions with data + // following the relocation are not accommodated for (see comment below + // regarding SIGNED{1,2,4}), so it isn't exactly that either. + Value += 1LL << Log2Size; + } + + if (Target.isAbsolute()) { // constant + // SymbolNum of 0 indicates the absolute section. + Type = macho::RIT_X86_64_Unsigned; + Index = 0; + + // FIXME: I believe this is broken, I don't think the linker can understand + // it. I think it would require a local relocation, but I'm not sure if that + // would work either. The official way to get an absolute PCrel relocation + // is to use an absolute symbol (which we don't support yet). + if (IsPCRel) { + IsExtern = 1; + Type = macho::RIT_X86_64_Branch; + } + } else if (Target.getSymB()) { // A - B + constant + const MCSymbol *A = &Target.getSymA()->getSymbol(); + MCSymbolData &A_SD = Asm.getSymbolData(*A); + const MCSymbolData *A_Base = Asm.getAtom(&A_SD); + + const MCSymbol *B = &Target.getSymB()->getSymbol(); + MCSymbolData &B_SD = Asm.getSymbolData(*B); + const MCSymbolData *B_Base = Asm.getAtom(&B_SD); + + // Neither symbol can be modified. + if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None || + Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None) + report_fatal_error("unsupported relocation of modified symbol"); + + // We don't support PCrel relocations of differences. Darwin 'as' doesn't + // implement most of these correctly. + if (IsPCRel) + report_fatal_error("unsupported pc-relative relocation of difference"); + + // The support for the situation where one or both of the symbols would + // require a local relocation is handled just like if the symbols were + // external. This is certainly used in the case of debug sections where the + // section has only temporary symbols and thus the symbols don't have base + // symbols. This is encoded using the section ordinal and non-extern + // relocation entries. + + // Darwin 'as' doesn't emit correct relocations for this (it ends up with a + // single SIGNED relocation); reject it for now. Except the case where both + // symbols don't have a base, equal but both NULL. + if (A_Base == B_Base && A_Base) + report_fatal_error("unsupported relocation with identical base"); + + Value += Writer->getSymbolAddress(&A_SD, Layout) - + (A_Base == NULL ? 0 : Writer->getSymbolAddress(A_Base, Layout)); + Value -= Writer->getSymbolAddress(&B_SD, Layout) - + (B_Base == NULL ? 
0 : Writer->getSymbolAddress(B_Base, Layout)); + + if (A_Base) { + Index = A_Base->getIndex(); + IsExtern = 1; + } + else { + Index = A_SD.getFragment()->getParent()->getOrdinal() + 1; + IsExtern = 0; + } + Type = macho::RIT_X86_64_Unsigned; + + macho::RelocationEntry MRE; + MRE.Word0 = FixupOffset; + MRE.Word1 = ((Index << 0) | + (IsPCRel << 24) | + (Log2Size << 25) | + (IsExtern << 27) | + (Type << 28)); + Writer->addRelocation(Fragment->getParent(), MRE); + + if (B_Base) { + Index = B_Base->getIndex(); + IsExtern = 1; + } + else { + Index = B_SD.getFragment()->getParent()->getOrdinal() + 1; + IsExtern = 0; + } + Type = macho::RIT_X86_64_Subtractor; + } else { + const MCSymbol *Symbol = &Target.getSymA()->getSymbol(); + MCSymbolData &SD = Asm.getSymbolData(*Symbol); + const MCSymbolData *Base = Asm.getAtom(&SD); + + // Relocations inside debug sections always use local relocations when + // possible. This seems to be done because the debugger doesn't fully + // understand x86_64 relocation entries, and expects to find values that + // have already been fixed up. + if (Symbol->isInSection()) { + const MCSectionMachO &Section = static_cast<const MCSectionMachO&>( + Fragment->getParent()->getSection()); + if (Section.hasAttribute(MCSectionMachO::S_ATTR_DEBUG)) + Base = 0; + } + + // x86_64 almost always uses external relocations, except when there is no + // symbol to use as a base address (a local symbol with no preceding + // non-local symbol). + if (Base) { + Index = Base->getIndex(); + IsExtern = 1; + + // Add the local offset, if needed. + if (Base != &SD) + Value += Layout.getSymbolOffset(&SD) - Layout.getSymbolOffset(Base); + } else if (Symbol->isInSection() && !Symbol->isVariable()) { + // The index is the section ordinal (1-based). + Index = SD.getFragment()->getParent()->getOrdinal() + 1; + IsExtern = 0; + Value += Writer->getSymbolAddress(&SD, Layout); + + if (IsPCRel) + Value -= FixupAddress + (1 << Log2Size); + } else if (Symbol->isVariable()) { + const MCExpr *Value = Symbol->getVariableValue(); + int64_t Res; + bool isAbs = Value->EvaluateAsAbsolute(Res, Layout, + Writer->getSectionAddressMap()); + if (isAbs) { + FixedValue = Res; + return; + } else { + report_fatal_error("unsupported relocation of variable '" + + Symbol->getName() + "'"); + } + } else { + report_fatal_error("unsupported relocation of undefined symbol '" + + Symbol->getName() + "'"); + } + + MCSymbolRefExpr::VariantKind Modifier = Target.getSymA()->getKind(); + if (IsPCRel) { + if (IsRIPRel) { + if (Modifier == MCSymbolRefExpr::VK_GOTPCREL) { + // x86_64 distinguishes movq foo@GOTPCREL so that the linker can + // rewrite the movq to an leaq at link time if the symbol ends up in + // the same linkage unit. + if (unsigned(Fixup.getKind()) == X86::reloc_riprel_4byte_movq_load) + Type = macho::RIT_X86_64_GOTLoad; + else + Type = macho::RIT_X86_64_GOT; + } else if (Modifier == MCSymbolRefExpr::VK_TLVP) { + Type = macho::RIT_X86_64_TLV; + } else if (Modifier != MCSymbolRefExpr::VK_None) { + report_fatal_error("unsupported symbol modifier in relocation"); + } else { + Type = macho::RIT_X86_64_Signed; + + // The Darwin x86_64 relocation format has a problem where it cannot + // encode an address (L<foo> + <constant>) which is outside the atom + // containing L<foo>. Generally, this shouldn't occur but it does + // happen when we have a RIPrel instruction with data following the + // relocation entry (e.g., movb $012, L0(%rip)). 
Even with the PCrel + // adjustment Darwin x86_64 uses, the offset is still negative and the + // linker has no way to recognize this. + // + // To work around this, Darwin uses several special relocation types + // to indicate the offsets. However, the specification or + // implementation of these seems to also be incomplete; they should + // adjust the addend as well based on the actual encoded instruction + // (the additional bias), but instead appear to just look at the final + // offset. + switch (-(Target.getConstant() + (1LL << Log2Size))) { + case 1: Type = macho::RIT_X86_64_Signed1; break; + case 2: Type = macho::RIT_X86_64_Signed2; break; + case 4: Type = macho::RIT_X86_64_Signed4; break; + } + } + } else { + if (Modifier != MCSymbolRefExpr::VK_None) + report_fatal_error("unsupported symbol modifier in branch " + "relocation"); + + Type = macho::RIT_X86_64_Branch; + } + } else { + if (Modifier == MCSymbolRefExpr::VK_GOT) { + Type = macho::RIT_X86_64_GOT; + } else if (Modifier == MCSymbolRefExpr::VK_GOTPCREL) { + // GOTPCREL is allowed as a modifier on non-PCrel instructions, in which + // case all we do is set the PCrel bit in the relocation entry; this is + // used with exception handling, for example. The source is required to + // include any necessary offset directly. + Type = macho::RIT_X86_64_GOT; + IsPCRel = 1; + } else if (Modifier == MCSymbolRefExpr::VK_TLVP) { + report_fatal_error("TLVP symbol modifier should have been rip-rel"); + } else if (Modifier != MCSymbolRefExpr::VK_None) + report_fatal_error("unsupported symbol modifier in relocation"); + else + Type = macho::RIT_X86_64_Unsigned; + } + } + + // x86_64 always writes custom values into the fixups. + FixedValue = Value; + + // struct relocation_info (8 bytes) + macho::RelocationEntry MRE; + MRE.Word0 = FixupOffset; + MRE.Word1 = ((Index << 0) | + (IsPCRel << 24) | + (Log2Size << 25) | + (IsExtern << 27) | + (Type << 28)); + Writer->addRelocation(Fragment->getParent(), MRE); +} + +void X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer, + const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup, + MCValue Target, + unsigned Log2Size, + uint64_t &FixedValue) { + uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset(); + unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind()); + unsigned Type = macho::RIT_Vanilla; + + // See <reloc.h>. + const MCSymbol *A = &Target.getSymA()->getSymbol(); + MCSymbolData *A_SD = &Asm.getSymbolData(*A); + + if (!A_SD->getFragment()) + report_fatal_error("symbol '" + A->getName() + + "' can not be undefined in a subtraction expression"); + + uint32_t Value = Writer->getSymbolAddress(A_SD, Layout); + uint64_t SecAddr = Writer->getSectionAddress(A_SD->getFragment()->getParent()); + FixedValue += SecAddr; + uint32_t Value2 = 0; + + if (const MCSymbolRefExpr *B = Target.getSymB()) { + MCSymbolData *B_SD = &Asm.getSymbolData(B->getSymbol()); + + if (!B_SD->getFragment()) + report_fatal_error("symbol '" + B->getSymbol().getName() + + "' can not be undefined in a subtraction expression"); + + // Select the appropriate difference relocation type. + // + // Note that there is no longer any semantic difference between these two + // relocation types from the linkers point of view, this is done solely for + // pedantic compatibility with 'as'. + Type = A_SD->isExternal() ? 
(unsigned)macho::RIT_Difference : + (unsigned)macho::RIT_Generic_LocalDifference; + Value2 = Writer->getSymbolAddress(B_SD, Layout); + FixedValue -= Writer->getSectionAddress(B_SD->getFragment()->getParent()); + } + + // Relocations are written out in reverse order, so the PAIR comes first. + if (Type == macho::RIT_Difference || + Type == macho::RIT_Generic_LocalDifference) { + macho::RelocationEntry MRE; + MRE.Word0 = ((0 << 0) | + (macho::RIT_Pair << 24) | + (Log2Size << 28) | + (IsPCRel << 30) | + macho::RF_Scattered); + MRE.Word1 = Value2; + Writer->addRelocation(Fragment->getParent(), MRE); + } + + macho::RelocationEntry MRE; + MRE.Word0 = ((FixupOffset << 0) | + (Type << 24) | + (Log2Size << 28) | + (IsPCRel << 30) | + macho::RF_Scattered); + MRE.Word1 = Value; + Writer->addRelocation(Fragment->getParent(), MRE); +} + +void X86MachObjectWriter::RecordTLVPRelocation(MachObjectWriter *Writer, + const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup, + MCValue Target, + uint64_t &FixedValue) { + assert(Target.getSymA()->getKind() == MCSymbolRefExpr::VK_TLVP && + !is64Bit() && + "Should only be called with a 32-bit TLVP relocation!"); + + unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind()); + uint32_t Value = Layout.getFragmentOffset(Fragment)+Fixup.getOffset(); + unsigned IsPCRel = 0; + + // Get the symbol data. + MCSymbolData *SD_A = &Asm.getSymbolData(Target.getSymA()->getSymbol()); + unsigned Index = SD_A->getIndex(); + + // We're only going to have a second symbol in pic mode and it'll be a + // subtraction from the picbase. For 32-bit pic the addend is the difference + // between the picbase and the next address. For 32-bit static the addend is + // zero. + if (Target.getSymB()) { + // If this is a subtraction then we're pcrel. + uint32_t FixupAddress = + Writer->getFragmentAddress(Fragment, Layout) + Fixup.getOffset(); + MCSymbolData *SD_B = &Asm.getSymbolData(Target.getSymB()->getSymbol()); + IsPCRel = 1; + FixedValue = (FixupAddress - Writer->getSymbolAddress(SD_B, Layout) + + Target.getConstant()); + FixedValue += 1ULL << Log2Size; + } else { + FixedValue = 0; + } + + // struct relocation_info (8 bytes) + macho::RelocationEntry MRE; + MRE.Word0 = Value; + MRE.Word1 = ((Index << 0) | + (IsPCRel << 24) | + (Log2Size << 25) | + (1 << 27) | // Extern + (macho::RIT_Generic_TLV << 28)); // Type + Writer->addRelocation(Fragment->getParent(), MRE); +} + +void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer, + const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup, + MCValue Target, + uint64_t &FixedValue) { + unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind()); + unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind()); + + // If this is a 32-bit TLVP reloc it's handled a bit differently. + if (Target.getSymA() && + Target.getSymA()->getKind() == MCSymbolRefExpr::VK_TLVP) { + RecordTLVPRelocation(Writer, Asm, Layout, Fragment, Fixup, Target, + FixedValue); + return; + } + + // If this is a difference or a defined symbol plus an offset, then we need a + // scattered relocation entry. Differences always require scattered + // relocations. + if (Target.getSymB()) + return RecordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup, + Target, Log2Size, FixedValue); + + // Get the symbol data, if any. 
+ MCSymbolData *SD = 0; + if (Target.getSymA()) + SD = &Asm.getSymbolData(Target.getSymA()->getSymbol()); + + // If this is an internal relocation with an offset, it also needs a scattered + // relocation entry. + uint32_t Offset = Target.getConstant(); + if (IsPCRel) + Offset += 1 << Log2Size; + if (Offset && SD && !Writer->doesSymbolRequireExternRelocation(SD)) + return RecordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup, + Target, Log2Size, FixedValue); + + // See <reloc.h>. + uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset(); + unsigned Index = 0; + unsigned IsExtern = 0; + unsigned Type = 0; + + if (Target.isAbsolute()) { // constant + // SymbolNum of 0 indicates the absolute section. + // + // FIXME: Currently, these are never generated (see code below). I cannot + // find a case where they are actually emitted. + Type = macho::RIT_Vanilla; + } else { + // Resolve constant variables. + if (SD->getSymbol().isVariable()) { + int64_t Res; + if (SD->getSymbol().getVariableValue()->EvaluateAsAbsolute( + Res, Layout, Writer->getSectionAddressMap())) { + FixedValue = Res; + return; + } + } + + // Check whether we need an external or internal relocation. + if (Writer->doesSymbolRequireExternRelocation(SD)) { + IsExtern = 1; + Index = SD->getIndex(); + // For external relocations, make sure to offset the fixup value to + // compensate for the addend of the symbol address, if it was + // undefined. This occurs with weak definitions, for example. + if (!SD->Symbol->isUndefined()) + FixedValue -= Layout.getSymbolOffset(SD); + } else { + // The index is the section ordinal (1-based). + const MCSectionData &SymSD = Asm.getSectionData( + SD->getSymbol().getSection()); + Index = SymSD.getOrdinal() + 1; + FixedValue += Writer->getSectionAddress(&SymSD); + } + if (IsPCRel) + FixedValue -= Writer->getSectionAddress(Fragment->getParent()); + + Type = macho::RIT_Vanilla; + } + + // struct relocation_info (8 bytes) + macho::RelocationEntry MRE; + MRE.Word0 = FixupOffset; + MRE.Word1 = ((Index << 0) | + (IsPCRel << 24) | + (Log2Size << 25) | + (IsExtern << 27) | + (Type << 28)); + Writer->addRelocation(Fragment->getParent(), MRE); +} + +MCObjectWriter *llvm::createX86MachObjectWriter(raw_ostream &OS, + bool Is64Bit, + uint32_t CPUType, + uint32_t CPUSubtype) { + return createMachObjectWriter(new X86MachObjectWriter(Is64Bit, + CPUType, + CPUSubtype), + OS, /*IsLittleEndian=*/true); +} diff --git a/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp b/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp new file mode 100644 index 0000000..52a67f7 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp @@ -0,0 +1,23 @@ +//===-- X86TargetInfo.cpp - X86 Target Implementation ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "llvm/Module.h" +#include "llvm/Support/TargetRegistry.h" +using namespace llvm; + +Target llvm::TheX86_32Target, llvm::TheX86_64Target; + +extern "C" void LLVMInitializeX86TargetInfo() { + RegisterTarget<Triple::x86, /*HasJIT=*/true> + X(TheX86_32Target, "x86", "32-bit X86: Pentium-Pro and above"); + + RegisterTarget<Triple::x86_64, /*HasJIT=*/true> + Y(TheX86_64Target, "x86-64", "64-bit X86: EM64T and AMD64"); +} diff --git a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp new file mode 100644 index 0000000..aeb3309 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp @@ -0,0 +1,243 @@ +//===-- X86ShuffleDecode.cpp - X86 shuffle decode logic -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Define several functions to decode x86 specific shuffle semantics into a +// generic vector mask. +// +//===----------------------------------------------------------------------===// + +#include "X86ShuffleDecode.h" + +//===----------------------------------------------------------------------===// +// Vector Mask Decoding +//===----------------------------------------------------------------------===// + +namespace llvm { + +void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<unsigned> &ShuffleMask) { + // Defaults the copying the dest value. + ShuffleMask.push_back(0); + ShuffleMask.push_back(1); + ShuffleMask.push_back(2); + ShuffleMask.push_back(3); + + // Decode the immediate. + unsigned ZMask = Imm & 15; + unsigned CountD = (Imm >> 4) & 3; + unsigned CountS = (Imm >> 6) & 3; + + // CountS selects which input element to use. + unsigned InVal = 4+CountS; + // CountD specifies which element of destination to update. + ShuffleMask[CountD] = InVal; + // ZMask zaps values, potentially overriding the CountD elt. 
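+  // For example, an immediate of 0x40 (CountS = 1, CountD = 0, no zeroing)
+  // produces the mask <5, 1, 2, 3>.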
+ if (ZMask & 1) ShuffleMask[0] = SM_SentinelZero; + if (ZMask & 2) ShuffleMask[1] = SM_SentinelZero; + if (ZMask & 4) ShuffleMask[2] = SM_SentinelZero; + if (ZMask & 8) ShuffleMask[3] = SM_SentinelZero; +} + +// <3,1> or <6,7,2,3> +void DecodeMOVHLPSMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask) { + for (unsigned i = NElts/2; i != NElts; ++i) + ShuffleMask.push_back(NElts+i); + + for (unsigned i = NElts/2; i != NElts; ++i) + ShuffleMask.push_back(i); +} + +// <0,2> or <0,1,4,5> +void DecodeMOVLHPSMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask) { + for (unsigned i = 0; i != NElts/2; ++i) + ShuffleMask.push_back(i); + + for (unsigned i = 0; i != NElts/2; ++i) + ShuffleMask.push_back(NElts+i); +} + +void DecodePSHUFMask(unsigned NElts, unsigned Imm, + SmallVectorImpl<unsigned> &ShuffleMask) { + for (unsigned i = 0; i != NElts; ++i) { + ShuffleMask.push_back(Imm % NElts); + Imm /= NElts; + } +} + +void DecodePSHUFHWMask(unsigned Imm, + SmallVectorImpl<unsigned> &ShuffleMask) { + ShuffleMask.push_back(0); + ShuffleMask.push_back(1); + ShuffleMask.push_back(2); + ShuffleMask.push_back(3); + for (unsigned i = 0; i != 4; ++i) { + ShuffleMask.push_back(4+(Imm & 3)); + Imm >>= 2; + } +} + +void DecodePSHUFLWMask(unsigned Imm, + SmallVectorImpl<unsigned> &ShuffleMask) { + for (unsigned i = 0; i != 4; ++i) { + ShuffleMask.push_back((Imm & 3)); + Imm >>= 2; + } + ShuffleMask.push_back(4); + ShuffleMask.push_back(5); + ShuffleMask.push_back(6); + ShuffleMask.push_back(7); +} + +void DecodePUNPCKLBWMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask) { + DecodeUNPCKLPMask(MVT::getVectorVT(MVT::i8, NElts), ShuffleMask); +} + +void DecodePUNPCKLWDMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask) { + DecodeUNPCKLPMask(MVT::getVectorVT(MVT::i16, NElts), ShuffleMask); +} + +void DecodePUNPCKLDQMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask) { + DecodeUNPCKLPMask(MVT::getVectorVT(MVT::i32, NElts), ShuffleMask); +} + +void DecodePUNPCKLQDQMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask) { + DecodeUNPCKLPMask(MVT::getVectorVT(MVT::i64, NElts), ShuffleMask); +} + +void DecodePUNPCKLMask(EVT VT, + SmallVectorImpl<unsigned> &ShuffleMask) { + DecodeUNPCKLPMask(VT, ShuffleMask); +} + +void DecodePUNPCKHMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask) { + for (unsigned i = 0; i != NElts/2; ++i) { + ShuffleMask.push_back(i+NElts/2); + ShuffleMask.push_back(i+NElts+NElts/2); + } +} + +void DecodeSHUFPSMask(unsigned NElts, unsigned Imm, + SmallVectorImpl<unsigned> &ShuffleMask) { + // Part that reads from dest. + for (unsigned i = 0; i != NElts/2; ++i) { + ShuffleMask.push_back(Imm % NElts); + Imm /= NElts; + } + // Part that reads from src. + for (unsigned i = 0; i != NElts/2; ++i) { + ShuffleMask.push_back(Imm % NElts + NElts); + Imm /= NElts; + } +} + +void DecodeUNPCKHPMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask) { + for (unsigned i = 0; i != NElts/2; ++i) { + ShuffleMask.push_back(i+NElts/2); // Reads from dest + ShuffleMask.push_back(i+NElts+NElts/2); // Reads from src + } +} + +void DecodeUNPCKLPSMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask) { + DecodeUNPCKLPMask(MVT::getVectorVT(MVT::i32, NElts), ShuffleMask); +} + +void DecodeUNPCKLPDMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask) { + DecodeUNPCKLPMask(MVT::getVectorVT(MVT::i64, NElts), ShuffleMask); +} + +/// DecodeUNPCKLPMask - This decodes the shuffle masks for unpcklps/unpcklpd +/// etc. 
VT indicates the type of the vector allowing it to handle different +/// datatypes and vector widths. +void DecodeUNPCKLPMask(EVT VT, + SmallVectorImpl<unsigned> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + + // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate + // independently on 128-bit lanes. + unsigned NumLanes = VT.getSizeInBits() / 128; + if (NumLanes == 0 ) NumLanes = 1; // Handle MMX + unsigned NumLaneElts = NumElts / NumLanes; + + unsigned Start = 0; + unsigned End = NumLaneElts / 2; + for (unsigned s = 0; s < NumLanes; ++s) { + for (unsigned i = Start; i != End; ++i) { + ShuffleMask.push_back(i); // Reads from dest/src1 + ShuffleMask.push_back(i+NumLaneElts); // Reads from src/src2 + } + // Process the next 128 bits. + Start += NumLaneElts; + End += NumLaneElts; + } +} + +// DecodeVPERMILPSMask - Decodes VPERMILPS permutes for any 128-bit 32-bit +// elements. For 256-bit vectors, it's considered as two 128 lanes, the +// referenced elements can't cross lanes and the mask of the first lane must +// be the same of the second. +void DecodeVPERMILPSMask(unsigned NumElts, unsigned Imm, + SmallVectorImpl<unsigned> &ShuffleMask) { + unsigned NumLanes = (NumElts*32)/128; + unsigned LaneSize = NumElts/NumLanes; + + for (unsigned l = 0; l != NumLanes; ++l) { + for (unsigned i = 0; i != LaneSize; ++i) { + unsigned Idx = (Imm >> (i*2)) & 0x3 ; + ShuffleMask.push_back(Idx+(l*LaneSize)); + } + } +} + +// DecodeVPERMILPDMask - Decodes VPERMILPD permutes for any 128-bit 64-bit +// elements. For 256-bit vectors, it's considered as two 128 lanes, the +// referenced elements can't cross lanes but the mask of the first lane can +// be the different of the second (not like VPERMILPS). +void DecodeVPERMILPDMask(unsigned NumElts, unsigned Imm, + SmallVectorImpl<unsigned> &ShuffleMask) { + unsigned NumLanes = (NumElts*64)/128; + unsigned LaneSize = NumElts/NumLanes; + + for (unsigned l = 0; l < NumLanes; ++l) { + for (unsigned i = l*LaneSize; i < LaneSize*(l+1); ++i) { + unsigned Idx = (Imm >> i) & 0x1; + ShuffleMask.push_back(Idx+(l*LaneSize)); + } + } +} + +void DecodeVPERM2F128Mask(EVT VT, unsigned Imm, + SmallVectorImpl<unsigned> &ShuffleMask) { + unsigned HalfSize = VT.getVectorNumElements()/2; + unsigned FstHalfBegin = (Imm & 0x3) * HalfSize; + unsigned SndHalfBegin = ((Imm >> 4) & 0x3) * HalfSize; + + for (int i = FstHalfBegin, e = FstHalfBegin+HalfSize; i != e; ++i) + ShuffleMask.push_back(i); + for (int i = SndHalfBegin, e = SndHalfBegin+HalfSize; i != e; ++i) + ShuffleMask.push_back(i); +} + +void DecodeVPERM2F128Mask(unsigned Imm, + SmallVectorImpl<unsigned> &ShuffleMask) { + // VPERM2F128 is used by any 256-bit EVT, but X86InstComments only + // has information about the instruction and not the types. So for + // instruction comments purpose, assume the 256-bit vector is v4i64. + return DecodeVPERM2F128Mask(MVT::v4i64, Imm, ShuffleMask); +} + +} // llvm namespace diff --git a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h new file mode 100644 index 0000000..58193e6 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h @@ -0,0 +1,107 @@ +//===-- X86ShuffleDecode.h - X86 shuffle decode logic -----------*-C++-*---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// Define several functions to decode x86 specific shuffle semantics into a +// generic vector mask. +// +//===----------------------------------------------------------------------===// + +#ifndef X86_SHUFFLE_DECODE_H +#define X86_SHUFFLE_DECODE_H + +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/ValueTypes.h" + +//===----------------------------------------------------------------------===// +// Vector Mask Decoding +//===----------------------------------------------------------------------===// + +namespace llvm { +enum { + SM_SentinelZero = ~0U +}; + +void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<unsigned> &ShuffleMask); + +// <3,1> or <6,7,2,3> +void DecodeMOVHLPSMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask); + +// <0,2> or <0,1,4,5> +void DecodeMOVLHPSMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask); + +void DecodePSHUFMask(unsigned NElts, unsigned Imm, + SmallVectorImpl<unsigned> &ShuffleMask); + +void DecodePSHUFHWMask(unsigned Imm, + SmallVectorImpl<unsigned> &ShuffleMask); + +void DecodePSHUFLWMask(unsigned Imm, + SmallVectorImpl<unsigned> &ShuffleMask); + +void DecodePUNPCKLBWMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask); + +void DecodePUNPCKLWDMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask); + +void DecodePUNPCKLDQMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask); + +void DecodePUNPCKLQDQMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask); + +void DecodePUNPCKLMask(EVT VT, + SmallVectorImpl<unsigned> &ShuffleMask); + +void DecodePUNPCKHMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask); + +void DecodeSHUFPSMask(unsigned NElts, unsigned Imm, + SmallVectorImpl<unsigned> &ShuffleMask); + +void DecodeUNPCKHPMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask); + +void DecodeUNPCKLPSMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask); + +void DecodeUNPCKLPDMask(unsigned NElts, + SmallVectorImpl<unsigned> &ShuffleMask); + +/// DecodeUNPCKLPMask - This decodes the shuffle masks for unpcklps/unpcklpd +/// etc. VT indicates the type of the vector allowing it to handle different +/// datatypes and vector widths. +void DecodeUNPCKLPMask(EVT VT, + SmallVectorImpl<unsigned> &ShuffleMask); + + +// DecodeVPERMILPSMask - Decodes VPERMILPS permutes for any 128-bit 32-bit +// elements. For 256-bit vectors, it's considered as two 128 lanes, the +// referenced elements can't cross lanes and the mask of the first lane must +// be the same of the second. +void DecodeVPERMILPSMask(unsigned NElts, unsigned Imm, + SmallVectorImpl<unsigned> &ShuffleMask); + +// DecodeVPERMILPDMask - Decodes VPERMILPD permutes for any 128-bit 64-bit +// elements. For 256-bit vectors, it's considered as two 128 lanes, the +// referenced elements can't cross lanes but the mask of the first lane can +// be the different of the second (not like VPERMILPS). 
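+// For a 128-bit vector (NElts == 2), an immediate of 1 yields the mask <1, 0>,
+// i.e. the two doubles are swapped.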
+void DecodeVPERMILPDMask(unsigned NElts, unsigned Imm, + SmallVectorImpl<unsigned> &ShuffleMask); + +void DecodeVPERM2F128Mask(unsigned Imm, + SmallVectorImpl<unsigned> &ShuffleMask); +void DecodeVPERM2F128Mask(EVT VT, unsigned Imm, + SmallVectorImpl<unsigned> &ShuffleMask); + +} // llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86.h b/contrib/llvm/lib/Target/X86/X86.h new file mode 100644 index 0000000..81e9422 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86.h @@ -0,0 +1,71 @@ +//===-- X86.h - Top-level interface for X86 representation ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the entry points for global functions defined in the x86 +// target library, as used by the LLVM JIT. +// +//===----------------------------------------------------------------------===// + +#ifndef TARGET_X86_H +#define TARGET_X86_H + +#include "MCTargetDesc/X86BaseInfo.h" +#include "MCTargetDesc/X86MCTargetDesc.h" +#include "llvm/Support/DataTypes.h" +#include "llvm/Target/TargetMachine.h" + +namespace llvm { + +class FunctionPass; +class JITCodeEmitter; +class MachineCodeEmitter; +class Target; +class X86TargetMachine; + +/// createX86ISelDag - This pass converts a legalized DAG into a +/// X86-specific DAG, ready for instruction scheduling. +/// +FunctionPass *createX86ISelDag(X86TargetMachine &TM, + CodeGenOpt::Level OptLevel); + +/// createGlobalBaseRegPass - This pass initializes a global base +/// register for PIC on x86-32. +FunctionPass* createGlobalBaseRegPass(); + +/// createX86FloatingPointStackifierPass - This function returns a pass which +/// converts floating point register references and pseudo instructions into +/// floating point stack references and physical instructions. +/// +FunctionPass *createX86FloatingPointStackifierPass(); + +/// createX86IssueVZeroUpperPass - This pass inserts AVX vzeroupper instructions +/// before each call to avoid transition penalty between functions encoded with +/// AVX and SSE. +FunctionPass *createX86IssueVZeroUpperPass(); + +/// createX86CodeEmitterPass - Return a pass that emits the collected X86 code +/// to the specified MCE object. +FunctionPass *createX86JITCodeEmitterPass(X86TargetMachine &TM, + JITCodeEmitter &JCE); + +/// createX86EmitCodeToMemory - Returns a pass that converts a register +/// allocated function into raw machine code in a dynamically +/// allocated chunk of memory. +/// +FunctionPass *createEmitX86CodeToMemory(); + +/// createX86MaxStackAlignmentHeuristicPass - This function returns a pass +/// which determines whether the frame pointer register should be +/// reserved in case dynamic stack alignment is later required. +/// +FunctionPass *createX86MaxStackAlignmentHeuristicPass(); + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86.td b/contrib/llvm/lib/Target/X86/X86.td new file mode 100644 index 0000000..104b91f --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86.td @@ -0,0 +1,268 @@ +//===- X86.td - Target definition file for the Intel X86 ---*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This is a target description file for the Intel i386 architecture, referred +// to here as the "X86" architecture. +// +//===----------------------------------------------------------------------===// + +// Get the target-independent interfaces which we are implementing... +// +include "llvm/Target/Target.td" + +//===----------------------------------------------------------------------===// +// X86 Subtarget state. +// + +def Mode64Bit : SubtargetFeature<"64bit-mode", "In64BitMode", "true", + "64-bit mode (x86_64)">; + +def ModeNaCl : SubtargetFeature<"nacl-mode", "InNaClMode", "true", + "Native Client mode">; + +//===----------------------------------------------------------------------===// +// X86 Subtarget features. +//===----------------------------------------------------------------------===// + +def FeatureCMOV : SubtargetFeature<"cmov","HasCMov", "true", + "Enable conditional move instructions">; + +def FeaturePOPCNT : SubtargetFeature<"popcnt", "HasPOPCNT", "true", + "Support POPCNT instruction">; + + +def FeatureMMX : SubtargetFeature<"mmx","X86SSELevel", "MMX", + "Enable MMX instructions">; +def FeatureSSE1 : SubtargetFeature<"sse", "X86SSELevel", "SSE1", + "Enable SSE instructions", + // SSE codegen depends on cmovs, and all + // SSE1+ processors support them. + [FeatureMMX, FeatureCMOV]>; +def FeatureSSE2 : SubtargetFeature<"sse2", "X86SSELevel", "SSE2", + "Enable SSE2 instructions", + [FeatureSSE1]>; +def FeatureSSE3 : SubtargetFeature<"sse3", "X86SSELevel", "SSE3", + "Enable SSE3 instructions", + [FeatureSSE2]>; +def FeatureSSSE3 : SubtargetFeature<"ssse3", "X86SSELevel", "SSSE3", + "Enable SSSE3 instructions", + [FeatureSSE3]>; +def FeatureSSE41 : SubtargetFeature<"sse41", "X86SSELevel", "SSE41", + "Enable SSE 4.1 instructions", + [FeatureSSSE3]>; +def FeatureSSE42 : SubtargetFeature<"sse42", "X86SSELevel", "SSE42", + "Enable SSE 4.2 instructions", + [FeatureSSE41, FeaturePOPCNT]>; +def Feature3DNow : SubtargetFeature<"3dnow", "X863DNowLevel", "ThreeDNow", + "Enable 3DNow! instructions", + [FeatureMMX]>; +def Feature3DNowA : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA", + "Enable 3DNow! Athlon instructions", + [Feature3DNow]>; +// All x86-64 hardware has SSE2, but we don't mark SSE2 as an implied +// feature, because SSE2 can be disabled (e.g. for compiling OS kernels) +// without disabling 64-bit mode. 
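+// (For illustration: such a build might pass something like
+// "-mattr=-sse,-sse2" to llc while still using an x86-64 triple; the exact
+// flag spelling here is an assumption.)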
+def Feature64Bit : SubtargetFeature<"64bit", "HasX86_64", "true", + "Support 64-bit instructions", + [FeatureCMOV]>; +def FeatureCMPXCHG16B : SubtargetFeature<"cmpxchg16b", "HasCmpxchg16b", "true", + "64-bit with cmpxchg16b", + [Feature64Bit]>; +def FeatureSlowBTMem : SubtargetFeature<"slow-bt-mem", "IsBTMemSlow", "true", + "Bit testing of memory is slow">; +def FeatureFastUAMem : SubtargetFeature<"fast-unaligned-mem", + "IsUAMemFast", "true", + "Fast unaligned memory access">; +def FeatureSSE4A : SubtargetFeature<"sse4a", "HasSSE4A", "true", + "Support SSE 4a instructions", + [FeaturePOPCNT]>; + +def FeatureAVX : SubtargetFeature<"avx", "HasAVX", "true", + "Enable AVX instructions">; +def FeatureCLMUL : SubtargetFeature<"clmul", "HasCLMUL", "true", + "Enable carry-less multiplication instructions">; +def FeatureFMA3 : SubtargetFeature<"fma3", "HasFMA3", "true", + "Enable three-operand fused multiple-add">; +def FeatureFMA4 : SubtargetFeature<"fma4", "HasFMA4", "true", + "Enable four-operand fused multiple-add">; +def FeatureVectorUAMem : SubtargetFeature<"vector-unaligned-mem", + "HasVectorUAMem", "true", + "Allow unaligned memory operands on vector/SIMD instructions">; +def FeatureAES : SubtargetFeature<"aes", "HasAES", "true", + "Enable AES instructions">; +def FeatureMOVBE : SubtargetFeature<"movbe", "HasMOVBE", "true", + "Support MOVBE instruction">; +def FeatureRDRAND : SubtargetFeature<"rdrand", "HasRDRAND", "true", + "Support RDRAND instruction">; +def FeatureF16C : SubtargetFeature<"f16c", "HasF16C", "true", + "Support 16-bit floating point conversion instructions">; +def FeatureLZCNT : SubtargetFeature<"lzcnt", "HasLZCNT", "true", + "Support LZCNT instruction">; +def FeatureBMI : SubtargetFeature<"bmi", "HasBMI", "true", + "Support BMI instructions">; + +//===----------------------------------------------------------------------===// +// X86 processors supported. +//===----------------------------------------------------------------------===// + +class Proc<string Name, list<SubtargetFeature> Features> + : Processor<Name, NoItineraries, Features>; + +def : Proc<"generic", []>; +def : Proc<"i386", []>; +def : Proc<"i486", []>; +def : Proc<"i586", []>; +def : Proc<"pentium", []>; +def : Proc<"pentium-mmx", [FeatureMMX]>; +def : Proc<"i686", []>; +def : Proc<"pentiumpro", [FeatureCMOV]>; +def : Proc<"pentium2", [FeatureMMX, FeatureCMOV]>; +def : Proc<"pentium3", [FeatureSSE1]>; +def : Proc<"pentium3m", [FeatureSSE1, FeatureSlowBTMem]>; +def : Proc<"pentium-m", [FeatureSSE2, FeatureSlowBTMem]>; +def : Proc<"pentium4", [FeatureSSE2]>; +def : Proc<"pentium4m", [FeatureSSE2, FeatureSlowBTMem]>; +def : Proc<"x86-64", [FeatureSSE2, Feature64Bit, FeatureSlowBTMem]>; +def : Proc<"yonah", [FeatureSSE3, FeatureSlowBTMem]>; +def : Proc<"prescott", [FeatureSSE3, FeatureSlowBTMem]>; +def : Proc<"nocona", [FeatureSSE3, FeatureCMPXCHG16B, + FeatureSlowBTMem]>; +def : Proc<"core2", [FeatureSSSE3, FeatureCMPXCHG16B, + FeatureSlowBTMem]>; +def : Proc<"penryn", [FeatureSSE41, FeatureCMPXCHG16B, + FeatureSlowBTMem]>; +def : Proc<"atom", [FeatureSSE3, FeatureCMPXCHG16B, FeatureMOVBE, + FeatureSlowBTMem]>; +// "Arrandale" along with corei3 and corei5 +def : Proc<"corei7", [FeatureSSE42, FeatureCMPXCHG16B, + FeatureSlowBTMem, FeatureFastUAMem, FeatureAES]>; +def : Proc<"nehalem", [FeatureSSE42, FeatureCMPXCHG16B, + FeatureSlowBTMem, FeatureFastUAMem]>; +// Westmere is a similar machine to nehalem with some additional features. 
+// Westmere is the corei3/i5/i7 path from nehalem to sandybridge +def : Proc<"westmere", [FeatureSSE42, FeatureCMPXCHG16B, + FeatureSlowBTMem, FeatureFastUAMem, FeatureAES, + FeatureCLMUL]>; +// Sandy Bridge +// SSE is not listed here since llvm treats AVX as a reimplementation of SSE, +// rather than a superset. +// FIXME: Disabling AVX for now since it's not ready. +def : Proc<"corei7-avx", [FeatureSSE42, FeatureCMPXCHG16B, + FeatureAES, FeatureCLMUL]>; +// Ivy Bridge +def : Proc<"core-avx-i", [FeatureSSE42, FeatureCMPXCHG16B, + FeatureAES, FeatureCLMUL, + FeatureRDRAND, FeatureF16C]>; + +// Haswell +def : Proc<"core-avx2", [FeatureSSE42, FeatureCMPXCHG16B, FeatureAES, + FeatureCLMUL, FeatureRDRAND, FeatureF16C, + FeatureFMA3, FeatureMOVBE, FeatureLZCNT, + FeatureBMI]>; + +def : Proc<"k6", [FeatureMMX]>; +def : Proc<"k6-2", [Feature3DNow]>; +def : Proc<"k6-3", [Feature3DNow]>; +def : Proc<"athlon", [Feature3DNowA, FeatureSlowBTMem]>; +def : Proc<"athlon-tbird", [Feature3DNowA, FeatureSlowBTMem]>; +def : Proc<"athlon-4", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem]>; +def : Proc<"athlon-xp", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem]>; +def : Proc<"athlon-mp", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem]>; +def : Proc<"k8", [FeatureSSE2, Feature3DNowA, Feature64Bit, + FeatureSlowBTMem]>; +def : Proc<"opteron", [FeatureSSE2, Feature3DNowA, Feature64Bit, + FeatureSlowBTMem]>; +def : Proc<"athlon64", [FeatureSSE2, Feature3DNowA, Feature64Bit, + FeatureSlowBTMem]>; +def : Proc<"athlon-fx", [FeatureSSE2, Feature3DNowA, Feature64Bit, + FeatureSlowBTMem]>; +def : Proc<"k8-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B, + FeatureSlowBTMem]>; +def : Proc<"opteron-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B, + FeatureSlowBTMem]>; +def : Proc<"athlon64-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B, + FeatureSlowBTMem]>; +def : Proc<"amdfam10", [FeatureSSE3, FeatureSSE4A, + Feature3DNowA, FeatureCMPXCHG16B, + FeatureSlowBTMem]>; +def : Proc<"barcelona", [FeatureSSE3, FeatureSSE4A, + Feature3DNowA, FeatureCMPXCHG16B, + FeatureSlowBTMem]>; +def : Proc<"istanbul", [Feature3DNowA, FeatureCMPXCHG16B, + FeatureSSE4A, Feature3DNowA]>; +def : Proc<"shanghai", [Feature3DNowA, FeatureCMPXCHG16B, FeatureSSE4A, + Feature3DNowA]>; + +def : Proc<"winchip-c6", [FeatureMMX]>; +def : Proc<"winchip2", [Feature3DNow]>; +def : Proc<"c3", [Feature3DNow]>; +def : Proc<"c3-2", [FeatureSSE1]>; + +//===----------------------------------------------------------------------===// +// Register File Description +//===----------------------------------------------------------------------===// + +include "X86RegisterInfo.td" + +//===----------------------------------------------------------------------===// +// Instruction Descriptions +//===----------------------------------------------------------------------===// + +include "X86InstrInfo.td" + +def X86InstrInfo : InstrInfo; + +//===----------------------------------------------------------------------===// +// Calling Conventions +//===----------------------------------------------------------------------===// + +include "X86CallingConv.td" + + +//===----------------------------------------------------------------------===// +// Assembly Parser +//===----------------------------------------------------------------------===// + +// Currently the X86 assembly parser only supports ATT syntax. +def ATTAsmParser : AsmParser { + string AsmParserClassName = "ATTAsmParser"; + int Variant = 0; + + // Discard comments in assembly strings. 
+ string CommentDelimiter = "#"; + + // Recognize hard coded registers. + string RegisterPrefix = "%"; +} + +//===----------------------------------------------------------------------===// +// Assembly Printers +//===----------------------------------------------------------------------===// + +// The X86 target supports two different syntaxes for emitting machine code. +// This is controlled by the -x86-asm-syntax={att|intel} +def ATTAsmWriter : AsmWriter { + string AsmWriterClassName = "ATTInstPrinter"; + int Variant = 0; + bit isMCAsmWriter = 1; +} +def IntelAsmWriter : AsmWriter { + string AsmWriterClassName = "IntelInstPrinter"; + int Variant = 1; + bit isMCAsmWriter = 1; +} + +def X86 : Target { + // Information about the instructions... + let InstructionSet = X86InstrInfo; + + let AssemblyParsers = [ATTAsmParser]; + + let AssemblyWriters = [ATTAsmWriter, IntelAsmWriter]; +} diff --git a/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp b/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp new file mode 100644 index 0000000..4c3ff02 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp @@ -0,0 +1,715 @@ +//===-- X86AsmPrinter.cpp - Convert X86 LLVM code to AT&T assembly --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a printer that converts from our internal representation +// of machine-dependent LLVM code to X86 machine code. +// +//===----------------------------------------------------------------------===// + +#include "X86AsmPrinter.h" +#include "InstPrinter/X86ATTInstPrinter.h" +#include "InstPrinter/X86IntelInstPrinter.h" +#include "X86MCInstLower.h" +#include "X86.h" +#include "X86COFFMachineModuleInfo.h" +#include "X86MachineFunctionInfo.h" +#include "X86TargetMachine.h" +#include "llvm/CallingConv.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Module.h" +#include "llvm/Type.h" +#include "llvm/Analysis/DebugInfo.h" +#include "llvm/Assembly/Writer.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/CodeGen/MachineModuleInfoImpls.h" +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/Target/Mangler.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Support/COFF.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/ADT/SmallString.h" +using namespace llvm; + +//===----------------------------------------------------------------------===// +// Primitive Helper Functions. +//===----------------------------------------------------------------------===// + +/// runOnMachineFunction - Emit the function body. +/// +bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) { + SetupMachineFunction(MF); + + if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho()) { + bool Intrn = MF.getFunction()->hasInternalLinkage(); + OutStreamer.BeginCOFFSymbolDef(CurrentFnSym); + OutStreamer.EmitCOFFSymbolStorageClass(Intrn ? 
COFF::IMAGE_SYM_CLASS_STATIC + : COFF::IMAGE_SYM_CLASS_EXTERNAL); + OutStreamer.EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_FUNCTION + << COFF::SCT_COMPLEX_TYPE_SHIFT); + OutStreamer.EndCOFFSymbolDef(); + } + + // Have common code print out the function header with linkage info etc. + EmitFunctionHeader(); + + // Emit the rest of the function body. + EmitFunctionBody(); + + // We didn't modify anything. + return false; +} + +/// printSymbolOperand - Print a raw symbol reference operand. This handles +/// jump tables, constant pools, global address and external symbols, all of +/// which print to a label with various suffixes for relocation types etc. +void X86AsmPrinter::printSymbolOperand(const MachineOperand &MO, + raw_ostream &O) { + switch (MO.getType()) { + default: llvm_unreachable("unknown symbol type!"); + case MachineOperand::MO_JumpTableIndex: + O << *GetJTISymbol(MO.getIndex()); + break; + case MachineOperand::MO_ConstantPoolIndex: + O << *GetCPISymbol(MO.getIndex()); + printOffset(MO.getOffset(), O); + break; + case MachineOperand::MO_GlobalAddress: { + const GlobalValue *GV = MO.getGlobal(); + + MCSymbol *GVSym; + if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB) + GVSym = GetSymbolWithGlobalValueBase(GV, "$stub"); + else if (MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY || + MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE || + MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE) + GVSym = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr"); + else + GVSym = Mang->getSymbol(GV); + + // Handle dllimport linkage. + if (MO.getTargetFlags() == X86II::MO_DLLIMPORT) + GVSym = OutContext.GetOrCreateSymbol(Twine("__imp_") + GVSym->getName()); + + if (MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY || + MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE) { + MCSymbol *Sym = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr"); + MachineModuleInfoImpl::StubValueTy &StubSym = + MMI->getObjFileInfo<MachineModuleInfoMachO>().getGVStubEntry(Sym); + if (StubSym.getPointer() == 0) + StubSym = MachineModuleInfoImpl:: + StubValueTy(Mang->getSymbol(GV), !GV->hasInternalLinkage()); + } else if (MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE){ + MCSymbol *Sym = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr"); + MachineModuleInfoImpl::StubValueTy &StubSym = + MMI->getObjFileInfo<MachineModuleInfoMachO>().getHiddenGVStubEntry(Sym); + if (StubSym.getPointer() == 0) + StubSym = MachineModuleInfoImpl:: + StubValueTy(Mang->getSymbol(GV), !GV->hasInternalLinkage()); + } else if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB) { + MCSymbol *Sym = GetSymbolWithGlobalValueBase(GV, "$stub"); + MachineModuleInfoImpl::StubValueTy &StubSym = + MMI->getObjFileInfo<MachineModuleInfoMachO>().getFnStubEntry(Sym); + if (StubSym.getPointer() == 0) + StubSym = MachineModuleInfoImpl:: + StubValueTy(Mang->getSymbol(GV), !GV->hasInternalLinkage()); + } + + // If the name begins with a dollar-sign, enclose it in parens. We do this + // to avoid having it look like an integer immediate to the assembler. 
+ if (GVSym->getName()[0] != '$')
+ O << *GVSym;
+ else
+ O << '(' << *GVSym << ')';
+ printOffset(MO.getOffset(), O);
+ break;
+ }
+ case MachineOperand::MO_ExternalSymbol: {
+ const MCSymbol *SymToPrint;
+ if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB) {
+ SmallString<128> TempNameStr;
+ TempNameStr += StringRef(MO.getSymbolName());
+ TempNameStr += StringRef("$stub");
+
+ MCSymbol *Sym = GetExternalSymbolSymbol(TempNameStr.str());
+ MachineModuleInfoImpl::StubValueTy &StubSym =
+ MMI->getObjFileInfo<MachineModuleInfoMachO>().getFnStubEntry(Sym);
+ if (StubSym.getPointer() == 0) {
+ TempNameStr.erase(TempNameStr.end()-5, TempNameStr.end());
+ StubSym = MachineModuleInfoImpl::
+ StubValueTy(OutContext.GetOrCreateSymbol(TempNameStr.str()),
+ true);
+ }
+ SymToPrint = StubSym.getPointer();
+ } else {
+ SymToPrint = GetExternalSymbolSymbol(MO.getSymbolName());
+ }
+
+ // If the name begins with a dollar-sign, enclose it in parens. We do this
+ // to avoid having it look like an integer immediate to the assembler.
+ if (SymToPrint->getName()[0] != '$')
+ O << *SymToPrint;
+ else
+ O << '(' << *SymToPrint << ')';
+ break;
+ }
+ }
+
+ switch (MO.getTargetFlags()) {
+ default:
+ llvm_unreachable("Unknown target flag on GV operand");
+ case X86II::MO_NO_FLAG: // No flag.
+ break;
+ case X86II::MO_DARWIN_NONLAZY:
+ case X86II::MO_DLLIMPORT:
+ case X86II::MO_DARWIN_STUB:
+ // These affect the name of the symbol, not any suffix.
+ break;
+ case X86II::MO_GOT_ABSOLUTE_ADDRESS:
+ O << " + [.-" << *MF->getPICBaseSymbol() << ']';
+ break;
+ case X86II::MO_PIC_BASE_OFFSET:
+ case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
+ case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE:
+ O << '-' << *MF->getPICBaseSymbol();
+ break;
+ case X86II::MO_TLSGD: O << "@TLSGD"; break;
+ case X86II::MO_GOTTPOFF: O << "@GOTTPOFF"; break;
+ case X86II::MO_INDNTPOFF: O << "@INDNTPOFF"; break;
+ case X86II::MO_TPOFF: O << "@TPOFF"; break;
+ case X86II::MO_NTPOFF: O << "@NTPOFF"; break;
+ case X86II::MO_GOTPCREL: O << "@GOTPCREL"; break;
+ case X86II::MO_GOT: O << "@GOT"; break;
+ case X86II::MO_GOTOFF: O << "@GOTOFF"; break;
+ case X86II::MO_PLT: O << "@PLT"; break;
+ case X86II::MO_TLVP: O << "@TLVP"; break;
+ case X86II::MO_TLVP_PIC_BASE:
+ O << "@TLVP" << '-' << *MF->getPICBaseSymbol();
+ break;
+ }
+}
+
+/// print_pcrel_imm - This is used to print an immediate value that ends up
+/// being encoded as a pc-relative value. These print slightly differently, for
+/// example, a $ is not emitted.
+void X86AsmPrinter::print_pcrel_imm(const MachineInstr *MI, unsigned OpNo,
+ raw_ostream &O) {
+ const MachineOperand &MO = MI->getOperand(OpNo);
+ switch (MO.getType()) {
+ default: llvm_unreachable("Unknown pcrel immediate operand");
+ case MachineOperand::MO_Register:
+ // pc-relativeness was handled when computing the value in the reg.
+ printOperand(MI, OpNo, O); + return; + case MachineOperand::MO_Immediate: + O << MO.getImm(); + return; + case MachineOperand::MO_MachineBasicBlock: + O << *MO.getMBB()->getSymbol(); + return; + case MachineOperand::MO_GlobalAddress: + case MachineOperand::MO_ExternalSymbol: + printSymbolOperand(MO, O); + return; + } +} + + +void X86AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo, + raw_ostream &O, const char *Modifier) { + const MachineOperand &MO = MI->getOperand(OpNo); + switch (MO.getType()) { + default: llvm_unreachable("unknown operand type!"); + case MachineOperand::MO_Register: { + O << '%'; + unsigned Reg = MO.getReg(); + if (Modifier && strncmp(Modifier, "subreg", strlen("subreg")) == 0) { + EVT VT = (strcmp(Modifier+6,"64") == 0) ? + MVT::i64 : ((strcmp(Modifier+6, "32") == 0) ? MVT::i32 : + ((strcmp(Modifier+6,"16") == 0) ? MVT::i16 : MVT::i8)); + Reg = getX86SubSuperRegister(Reg, VT); + } + O << X86ATTInstPrinter::getRegisterName(Reg); + return; + } + + case MachineOperand::MO_Immediate: + O << '$' << MO.getImm(); + return; + + case MachineOperand::MO_JumpTableIndex: + case MachineOperand::MO_ConstantPoolIndex: + case MachineOperand::MO_GlobalAddress: + case MachineOperand::MO_ExternalSymbol: { + O << '$'; + printSymbolOperand(MO, O); + break; + } + } +} + +void X86AsmPrinter::printSSECC(const MachineInstr *MI, unsigned Op, + raw_ostream &O) { + unsigned char value = MI->getOperand(Op).getImm(); + assert(value <= 7 && "Invalid ssecc argument!"); + switch (value) { + case 0: O << "eq"; break; + case 1: O << "lt"; break; + case 2: O << "le"; break; + case 3: O << "unord"; break; + case 4: O << "neq"; break; + case 5: O << "nlt"; break; + case 6: O << "nle"; break; + case 7: O << "ord"; break; + } +} + +void X86AsmPrinter::printLeaMemReference(const MachineInstr *MI, unsigned Op, + raw_ostream &O, const char *Modifier) { + const MachineOperand &BaseReg = MI->getOperand(Op); + const MachineOperand &IndexReg = MI->getOperand(Op+2); + const MachineOperand &DispSpec = MI->getOperand(Op+3); + + // If we really don't want to print out (rip), don't. + bool HasBaseReg = BaseReg.getReg() != 0; + if (HasBaseReg && Modifier && !strcmp(Modifier, "no-rip") && + BaseReg.getReg() == X86::RIP) + HasBaseReg = false; + + // HasParenPart - True if we will print out the () part of the mem ref. 
+ bool HasParenPart = IndexReg.getReg() || HasBaseReg; + + if (DispSpec.isImm()) { + int DispVal = DispSpec.getImm(); + if (DispVal || !HasParenPart) + O << DispVal; + } else { + assert(DispSpec.isGlobal() || DispSpec.isCPI() || + DispSpec.isJTI() || DispSpec.isSymbol()); + printSymbolOperand(MI->getOperand(Op+3), O); + } + + if (Modifier && strcmp(Modifier, "H") == 0) + O << "+8"; + + if (HasParenPart) { + assert(IndexReg.getReg() != X86::ESP && + "X86 doesn't allow scaling by ESP"); + + O << '('; + if (HasBaseReg) + printOperand(MI, Op, O, Modifier); + + if (IndexReg.getReg()) { + O << ','; + printOperand(MI, Op+2, O, Modifier); + unsigned ScaleVal = MI->getOperand(Op+1).getImm(); + if (ScaleVal != 1) + O << ',' << ScaleVal; + } + O << ')'; + } +} + +void X86AsmPrinter::printMemReference(const MachineInstr *MI, unsigned Op, + raw_ostream &O, const char *Modifier) { + assert(isMem(MI, Op) && "Invalid memory reference!"); + const MachineOperand &Segment = MI->getOperand(Op+4); + if (Segment.getReg()) { + printOperand(MI, Op+4, O, Modifier); + O << ':'; + } + printLeaMemReference(MI, Op, O, Modifier); +} + +void X86AsmPrinter::printPICLabel(const MachineInstr *MI, unsigned Op, + raw_ostream &O) { + O << *MF->getPICBaseSymbol() << '\n'; + O << *MF->getPICBaseSymbol() << ':'; +} + +bool X86AsmPrinter::printAsmMRegister(const MachineOperand &MO, char Mode, + raw_ostream &O) { + unsigned Reg = MO.getReg(); + switch (Mode) { + default: return true; // Unknown mode. + case 'b': // Print QImode register + Reg = getX86SubSuperRegister(Reg, MVT::i8); + break; + case 'h': // Print QImode high register + Reg = getX86SubSuperRegister(Reg, MVT::i8, true); + break; + case 'w': // Print HImode register + Reg = getX86SubSuperRegister(Reg, MVT::i16); + break; + case 'k': // Print SImode register + Reg = getX86SubSuperRegister(Reg, MVT::i32); + break; + case 'q': // Print DImode register + Reg = getX86SubSuperRegister(Reg, MVT::i64); + break; + } + + O << '%' << X86ATTInstPrinter::getRegisterName(Reg); + return false; +} + +/// PrintAsmOperand - Print out an operand for an inline asm expression. +/// +bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, + const char *ExtraCode, raw_ostream &O) { + // Does this asm operand have a single letter operand modifier? + if (ExtraCode && ExtraCode[0]) { + if (ExtraCode[1] != 0) return true; // Unknown modifier. + + const MachineOperand &MO = MI->getOperand(OpNo); + + switch (ExtraCode[0]) { + default: return true; // Unknown modifier. + case 'a': // This is an address. Currently only 'i' and 'r' are expected. + if (MO.isImm()) { + O << MO.getImm(); + return false; + } + if (MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isSymbol()) { + printSymbolOperand(MO, O); + if (Subtarget->isPICStyleRIPRel()) + O << "(%rip)"; + return false; + } + if (MO.isReg()) { + O << '('; + printOperand(MI, OpNo, O); + O << ')'; + return false; + } + return true; + + case 'c': // Don't print "$" before a global var name or constant. 
+ if (MO.isImm()) + O << MO.getImm(); + else if (MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isSymbol()) + printSymbolOperand(MO, O); + else + printOperand(MI, OpNo, O); + return false; + + case 'A': // Print '*' before a register (it must be a register) + if (MO.isReg()) { + O << '*'; + printOperand(MI, OpNo, O); + return false; + } + return true; + + case 'b': // Print QImode register + case 'h': // Print QImode high register + case 'w': // Print HImode register + case 'k': // Print SImode register + case 'q': // Print DImode register + if (MO.isReg()) + return printAsmMRegister(MO, ExtraCode[0], O); + printOperand(MI, OpNo, O); + return false; + + case 'P': // This is the operand of a call, treat specially. + print_pcrel_imm(MI, OpNo, O); + return false; + + case 'n': // Negate the immediate or print a '-' before the operand. + // Note: this is a temporary solution. It should be handled target + // independently as part of the 'MC' work. + if (MO.isImm()) { + O << -MO.getImm(); + return false; + } + O << '-'; + } + } + + printOperand(MI, OpNo, O); + return false; +} + +bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, + unsigned OpNo, unsigned AsmVariant, + const char *ExtraCode, + raw_ostream &O) { + if (ExtraCode && ExtraCode[0]) { + if (ExtraCode[1] != 0) return true; // Unknown modifier. + + switch (ExtraCode[0]) { + default: return true; // Unknown modifier. + case 'b': // Print QImode register + case 'h': // Print QImode high register + case 'w': // Print HImode register + case 'k': // Print SImode register + case 'q': // Print SImode register + // These only apply to registers, ignore on mem. + break; + case 'H': + printMemReference(MI, OpNo, O, "H"); + return false; + case 'P': // Don't print @PLT, but do print as memory. + printMemReference(MI, OpNo, O, "no-rip"); + return false; + } + } + printMemReference(MI, OpNo, O); + return false; +} + +void X86AsmPrinter::EmitStartOfAsmFile(Module &M) { + if (Subtarget->isTargetEnvMacho()) + OutStreamer.SwitchSection(getObjFileLowering().getTextSection()); +} + + +void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { + if (Subtarget->isTargetEnvMacho()) { + // All darwin targets use mach-o. + MachineModuleInfoMachO &MMIMacho = + MMI->getObjFileInfo<MachineModuleInfoMachO>(); + + // Output stubs for dynamically-linked functions. + MachineModuleInfoMachO::SymbolListTy Stubs; + + Stubs = MMIMacho.GetFnStubList(); + if (!Stubs.empty()) { + const MCSection *TheSection = + OutContext.getMachOSection("__IMPORT", "__jump_table", + MCSectionMachO::S_SYMBOL_STUBS | + MCSectionMachO::S_ATTR_SELF_MODIFYING_CODE | + MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS, + 5, SectionKind::getMetadata()); + OutStreamer.SwitchSection(TheSection); + + for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { + // L_foo$stub: + OutStreamer.EmitLabel(Stubs[i].first); + // .indirect_symbol _foo + OutStreamer.EmitSymbolAttribute(Stubs[i].second.getPointer(), + MCSA_IndirectSymbol); + // hlt; hlt; hlt; hlt; hlt hlt = 0xf4. + const char HltInsts[] = "\xf4\xf4\xf4\xf4\xf4"; + OutStreamer.EmitBytes(StringRef(HltInsts, 5), 0/*addrspace*/); + } + + Stubs.clear(); + OutStreamer.AddBlankLine(); + } + + // Output stubs for external and common global variables. 
+ Stubs = MMIMacho.GetGVStubList(); + if (!Stubs.empty()) { + const MCSection *TheSection = + OutContext.getMachOSection("__IMPORT", "__pointers", + MCSectionMachO::S_NON_LAZY_SYMBOL_POINTERS, + SectionKind::getMetadata()); + OutStreamer.SwitchSection(TheSection); + + for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { + // L_foo$non_lazy_ptr: + OutStreamer.EmitLabel(Stubs[i].first); + // .indirect_symbol _foo + MachineModuleInfoImpl::StubValueTy &MCSym = Stubs[i].second; + OutStreamer.EmitSymbolAttribute(MCSym.getPointer(), + MCSA_IndirectSymbol); + // .long 0 + if (MCSym.getInt()) + // External to current translation unit. + OutStreamer.EmitIntValue(0, 4/*size*/, 0/*addrspace*/); + else + // Internal to current translation unit. + // + // When we place the LSDA into the TEXT section, the type info + // pointers need to be indirect and pc-rel. We accomplish this by + // using NLPs. However, sometimes the types are local to the file. So + // we need to fill in the value for the NLP in those cases. + OutStreamer.EmitValue(MCSymbolRefExpr::Create(MCSym.getPointer(), + OutContext), + 4/*size*/, 0/*addrspace*/); + } + Stubs.clear(); + OutStreamer.AddBlankLine(); + } + + Stubs = MMIMacho.GetHiddenGVStubList(); + if (!Stubs.empty()) { + OutStreamer.SwitchSection(getObjFileLowering().getDataSection()); + EmitAlignment(2); + + for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { + // L_foo$non_lazy_ptr: + OutStreamer.EmitLabel(Stubs[i].first); + // .long _foo + OutStreamer.EmitValue(MCSymbolRefExpr:: + Create(Stubs[i].second.getPointer(), + OutContext), + 4/*size*/, 0/*addrspace*/); + } + Stubs.clear(); + OutStreamer.AddBlankLine(); + } + + // Funny Darwin hack: This flag tells the linker that no global symbols + // contain code that falls through to other global symbols (e.g. the obvious + // implementation of multiple entry points). If this doesn't occur, the + // linker can safely perform dead code stripping. Since LLVM never + // generates code that does this, it is always safe to set. + OutStreamer.EmitAssemblerFlag(MCAF_SubsectionsViaSymbols); + } + + if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing() && + MMI->callsExternalVAFunctionWithFloatingPointArguments()) { + StringRef SymbolName = Subtarget->is64Bit() ? 
"_fltused" : "__fltused"; + MCSymbol *S = MMI->getContext().GetOrCreateSymbol(SymbolName); + OutStreamer.EmitSymbolAttribute(S, MCSA_Global); + } + + if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho()) { + X86COFFMachineModuleInfo &COFFMMI = + MMI->getObjFileInfo<X86COFFMachineModuleInfo>(); + + // Emit type information for external functions + typedef X86COFFMachineModuleInfo::externals_iterator externals_iterator; + for (externals_iterator I = COFFMMI.externals_begin(), + E = COFFMMI.externals_end(); + I != E; ++I) { + OutStreamer.BeginCOFFSymbolDef(CurrentFnSym); + OutStreamer.EmitCOFFSymbolStorageClass(COFF::IMAGE_SYM_CLASS_EXTERNAL); + OutStreamer.EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_FUNCTION + << COFF::SCT_COMPLEX_TYPE_SHIFT); + OutStreamer.EndCOFFSymbolDef(); + } + + // Necessary for dllexport support + std::vector<const MCSymbol*> DLLExportedFns, DLLExportedGlobals; + + const TargetLoweringObjectFileCOFF &TLOFCOFF = + static_cast<const TargetLoweringObjectFileCOFF&>(getObjFileLowering()); + + for (Module::const_iterator I = M.begin(), E = M.end(); I != E; ++I) + if (I->hasDLLExportLinkage()) + DLLExportedFns.push_back(Mang->getSymbol(I)); + + for (Module::const_global_iterator I = M.global_begin(), + E = M.global_end(); I != E; ++I) + if (I->hasDLLExportLinkage()) + DLLExportedGlobals.push_back(Mang->getSymbol(I)); + + // Output linker support code for dllexported globals on windows. + if (!DLLExportedGlobals.empty() || !DLLExportedFns.empty()) { + OutStreamer.SwitchSection(TLOFCOFF.getDrectveSection()); + SmallString<128> name; + for (unsigned i = 0, e = DLLExportedGlobals.size(); i != e; ++i) { + if (Subtarget->isTargetWindows()) + name = " /EXPORT:"; + else + name = " -export:"; + name += DLLExportedGlobals[i]->getName(); + if (Subtarget->isTargetWindows()) + name += ",DATA"; + else + name += ",data"; + OutStreamer.EmitBytes(name, 0); + } + + for (unsigned i = 0, e = DLLExportedFns.size(); i != e; ++i) { + if (Subtarget->isTargetWindows()) + name = " /EXPORT:"; + else + name = " -export:"; + name += DLLExportedFns[i]->getName(); + OutStreamer.EmitBytes(name, 0); + } + } + } + + if (Subtarget->isTargetELF()) { + const TargetLoweringObjectFileELF &TLOFELF = + static_cast<const TargetLoweringObjectFileELF &>(getObjFileLowering()); + + MachineModuleInfoELF &MMIELF = MMI->getObjFileInfo<MachineModuleInfoELF>(); + + // Output stubs for external and common global variables. + MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList(); + if (!Stubs.empty()) { + OutStreamer.SwitchSection(TLOFELF.getDataRelSection()); + const TargetData *TD = TM.getTargetData(); + + for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { + OutStreamer.EmitLabel(Stubs[i].first); + OutStreamer.EmitSymbolValue(Stubs[i].second.getPointer(), + TD->getPointerSize(), 0); + } + Stubs.clear(); + } + } +} + +MachineLocation +X86AsmPrinter::getDebugValueLocation(const MachineInstr *MI) const { + MachineLocation Location; + assert (MI->getNumOperands() == 7 && "Invalid no. of machine operands!"); + // Frame address. Currently handles register +- offset only. + + if (MI->getOperand(0).isReg() && MI->getOperand(3).isImm()) + Location.set(MI->getOperand(0).getReg(), MI->getOperand(3).getImm()); + else { + DEBUG(dbgs() << "DBG_VALUE instruction ignored! " << *MI << "\n"); + } + return Location; +} + +void X86AsmPrinter::PrintDebugValueComment(const MachineInstr *MI, + raw_ostream &O) { + // Only the target-dependent form of DBG_VALUE should get here. 
+ // Referencing the offset and metadata as NOps-2 and NOps-1 is + // probably portable to other targets; frame pointer location is not. + unsigned NOps = MI->getNumOperands(); + assert(NOps==7); + O << '\t' << MAI->getCommentString() << "DEBUG_VALUE: "; + // cast away const; DIetc do not take const operands for some reason. + DIVariable V(const_cast<MDNode *>(MI->getOperand(NOps-1).getMetadata())); + if (V.getContext().isSubprogram()) + O << DISubprogram(V.getContext()).getDisplayName() << ":"; + O << V.getName(); + O << " <- "; + // Frame address. Currently handles register +- offset only. + O << '['; + if (MI->getOperand(0).isReg() && MI->getOperand(0).getReg()) + printOperand(MI, 0, O); + else + O << "undef"; + O << '+'; printOperand(MI, 3, O); + O << ']'; + O << "+"; + printOperand(MI, NOps-2, O); +} + + + +//===----------------------------------------------------------------------===// +// Target Registry Stuff +//===----------------------------------------------------------------------===// + +// Force static initialization. +extern "C" void LLVMInitializeX86AsmPrinter() { + RegisterAsmPrinter<X86AsmPrinter> X(TheX86_32Target); + RegisterAsmPrinter<X86AsmPrinter> Y(TheX86_64Target); +} diff --git a/contrib/llvm/lib/Target/X86/X86AsmPrinter.h b/contrib/llvm/lib/Target/X86/X86AsmPrinter.h new file mode 100644 index 0000000..3a50435 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86AsmPrinter.h @@ -0,0 +1,87 @@ +//===-- X86AsmPrinter.h - Convert X86 LLVM code to assembly -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// AT&T assembly code printer class. +// +//===----------------------------------------------------------------------===// + +#ifndef X86ASMPRINTER_H +#define X86ASMPRINTER_H + +#include "X86.h" +#include "X86MachineFunctionInfo.h" +#include "X86TargetMachine.h" +#include "llvm/ADT/StringSet.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/Support/Compiler.h" + +namespace llvm { + +class MachineJumpTableInfo; +class MCContext; +class MCInst; +class MCStreamer; +class MCSymbol; + +class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { + const X86Subtarget *Subtarget; + public: + explicit X86AsmPrinter(TargetMachine &TM, MCStreamer &Streamer) + : AsmPrinter(TM, Streamer) { + Subtarget = &TM.getSubtarget<X86Subtarget>(); + } + + virtual const char *getPassName() const { + return "X86 AT&T-Style Assembly Printer"; + } + + const X86Subtarget &getSubtarget() const { return *Subtarget; } + + virtual void EmitStartOfAsmFile(Module &M); + + virtual void EmitEndOfAsmFile(Module &M); + + virtual void EmitInstruction(const MachineInstr *MI); + + void printSymbolOperand(const MachineOperand &MO, raw_ostream &O); + + // These methods are used by the tablegen'erated instruction printer. 
+ void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O, + const char *Modifier = 0); + void print_pcrel_imm(const MachineInstr *MI, unsigned OpNo, raw_ostream &O); + + bool printAsmMRegister(const MachineOperand &MO, char Mode, raw_ostream &O); + bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &OS); + bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &OS); + + void printMachineInstruction(const MachineInstr *MI); + void printSSECC(const MachineInstr *MI, unsigned Op, raw_ostream &O); + void printMemReference(const MachineInstr *MI, unsigned Op, raw_ostream &O, + const char *Modifier=NULL); + void printLeaMemReference(const MachineInstr *MI, unsigned Op, raw_ostream &O, + const char *Modifier=NULL); + + void printPICLabel(const MachineInstr *MI, unsigned Op, raw_ostream &O); + + bool runOnMachineFunction(MachineFunction &F); + + void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS); + + MachineLocation getDebugValueLocation(const MachineInstr *MI) const; +}; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86COFFMachineModuleInfo.cpp b/contrib/llvm/lib/Target/X86/X86COFFMachineModuleInfo.cpp new file mode 100644 index 0000000..4326814 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86COFFMachineModuleInfo.cpp @@ -0,0 +1,20 @@ +//===-- llvm/CodeGen/X86COFFMachineModuleInfo.cpp -------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This is an MMI implementation for X86 COFF (windows) targets. +// +//===----------------------------------------------------------------------===// + +#include "X86COFFMachineModuleInfo.h" +using namespace llvm; + + +X86COFFMachineModuleInfo::~X86COFFMachineModuleInfo() { +} + diff --git a/contrib/llvm/lib/Target/X86/X86COFFMachineModuleInfo.h b/contrib/llvm/lib/Target/X86/X86COFFMachineModuleInfo.h new file mode 100644 index 0000000..98ab2a6 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86COFFMachineModuleInfo.h @@ -0,0 +1,46 @@ +//===-- llvm/CodeGen/X86COFFMachineModuleInfo.h -----------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This is an MMI implementation for X86 COFF (windows) targets. +// +//===----------------------------------------------------------------------===// + +#ifndef X86COFF_MACHINEMODULEINFO_H +#define X86COFF_MACHINEMODULEINFO_H + +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/ADT/DenseSet.h" +#include "X86MachineFunctionInfo.h" + +namespace llvm { + class X86MachineFunctionInfo; + class TargetData; + +/// X86COFFMachineModuleInfo - This is a MachineModuleInfoImpl implementation +/// for X86 COFF targets. 
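+///
+/// Typical use (a sketch of the flow in this patch): callers record
+/// externally referenced function symbols with addExternalFunction(), and
+/// X86AsmPrinter::EmitEndOfAsmFile() later walks externals_begin() /
+/// externals_end() to emit COFF symbol definitions for them.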
+class X86COFFMachineModuleInfo : public MachineModuleInfoImpl { + DenseSet<MCSymbol const *> Externals; +public: + X86COFFMachineModuleInfo(const MachineModuleInfo &) {} + virtual ~X86COFFMachineModuleInfo(); + + void addExternalFunction(MCSymbol* Symbol) { + Externals.insert(Symbol); + } + + typedef DenseSet<MCSymbol const *>::const_iterator externals_iterator; + externals_iterator externals_begin() const { return Externals.begin(); } + externals_iterator externals_end() const { return Externals.end(); } +}; + + + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86CallingConv.td b/contrib/llvm/lib/Target/X86/X86CallingConv.td new file mode 100644 index 0000000..77b9905 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86CallingConv.td @@ -0,0 +1,401 @@ +//===- X86CallingConv.td - Calling Conventions X86 32/64 ---*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This describes the calling conventions for the X86-32 and X86-64 +// architectures. +// +//===----------------------------------------------------------------------===// + +/// CCIfSubtarget - Match if the current subtarget has a feature F. +class CCIfSubtarget<string F, CCAction A> + : CCIf<!strconcat("State.getTarget().getSubtarget<X86Subtarget>().", F), A>; + +//===----------------------------------------------------------------------===// +// Return Value Calling Conventions +//===----------------------------------------------------------------------===// + +// Return-value conventions common to all X86 CC's. +def RetCC_X86Common : CallingConv<[ + // Scalar values are returned in AX first, then DX. For i8, the ABI + // requires the values to be in AL and AH, however this code uses AL and DL + // instead. This is because using AH for the second register conflicts with + // the way LLVM does multiple return values -- a return of {i16,i8} would end + // up in AX and AH, which overlap. Front-ends wishing to conform to the ABI + // for functions that return two i8 values are currently expected to pack the + // values into an i16 (which uses AX, and thus AL:AH). + CCIfType<[i8] , CCAssignToReg<[AL, DL]>>, + CCIfType<[i16], CCAssignToReg<[AX, DX]>>, + CCIfType<[i32], CCAssignToReg<[EAX, EDX]>>, + CCIfType<[i64], CCAssignToReg<[RAX, RDX]>>, + + // Vector types are returned in XMM0 and XMM1, when they fit. XMM2 and XMM3 + // can only be used by ABI non-compliant code. If the target doesn't have XMM + // registers, it won't have vector types. + CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>, + + // 256-bit vectors are returned in YMM0 and XMM1, when they fit. YMM2 and YMM3 + // can only be used by ABI non-compliant code. This vector type is only + // supported while using the AVX target feature. + CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>, + + // MMX vector types are always returned in MM0. If the target doesn't have + // MM0, it doesn't support these vector types. + CCIfType<[x86mmx], CCAssignToReg<[MM0]>>, + + // Long double types are always returned in ST0 (even with SSE). + CCIfType<[f80], CCAssignToReg<[ST0, ST1]>> +]>; + +// X86-32 C return-value convention. 
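+// For example (illustrative): a plain 'double' return comes back in ST0,
+// while a 'double' return marked 'inreg' on an SSE2-capable target comes
+// back in XMM0, per the rules below.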
+def RetCC_X86_32_C : CallingConv<[ + // The X86-32 calling convention returns FP values in ST0, unless marked + // with "inreg" (used here to distinguish one kind of reg from another, + // weirdly; this is really the sse-regparm calling convention) in which + // case they use XMM0, otherwise it is the same as the common X86 calling + // conv. + CCIfInReg<CCIfSubtarget<"hasXMMInt()", + CCIfType<[f32, f64], CCAssignToReg<[XMM0,XMM1,XMM2]>>>>, + CCIfType<[f32,f64], CCAssignToReg<[ST0, ST1]>>, + CCDelegateTo<RetCC_X86Common> +]>; + +// X86-32 FastCC return-value convention. +def RetCC_X86_32_Fast : CallingConv<[ + // The X86-32 fastcc returns 1, 2, or 3 FP values in XMM0-2 if the target has + // SSE2. + // This can happen when a float, 2 x float, or 3 x float vector is split by + // target lowering, and is returned in 1-3 sse regs. + CCIfType<[f32], CCIfSubtarget<"hasXMMInt()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>, + CCIfType<[f64], CCIfSubtarget<"hasXMMInt()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>, + + // For integers, ECX can be used as an extra return register + CCIfType<[i8], CCAssignToReg<[AL, DL, CL]>>, + CCIfType<[i16], CCAssignToReg<[AX, DX, CX]>>, + CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>, + + // Otherwise, it is the same as the common X86 calling convention. + CCDelegateTo<RetCC_X86Common> +]>; + +// X86-64 C return-value convention. +def RetCC_X86_64_C : CallingConv<[ + // The X86-64 calling convention always returns FP values in XMM0. + CCIfType<[f32], CCAssignToReg<[XMM0, XMM1]>>, + CCIfType<[f64], CCAssignToReg<[XMM0, XMM1]>>, + + // MMX vector types are always returned in XMM0. + CCIfType<[x86mmx], CCAssignToReg<[XMM0, XMM1]>>, + CCDelegateTo<RetCC_X86Common> +]>; + +// X86-Win64 C return-value convention. +def RetCC_X86_Win64_C : CallingConv<[ + // The X86-Win64 calling convention always returns __m64 values in RAX. + CCIfType<[x86mmx], CCBitConvertToType<i64>>, + + // Otherwise, everything is the same as 'normal' X86-64 C CC. + CCDelegateTo<RetCC_X86_64_C> +]>; + + +// This is the root return-value convention for the X86-32 backend. +def RetCC_X86_32 : CallingConv<[ + // If FastCC, use RetCC_X86_32_Fast. + CCIfCC<"CallingConv::Fast", CCDelegateTo<RetCC_X86_32_Fast>>, + // Otherwise, use RetCC_X86_32_C. + CCDelegateTo<RetCC_X86_32_C> +]>; + +// This is the root return-value convention for the X86-64 backend. +def RetCC_X86_64 : CallingConv<[ + // Mingw64 and native Win64 use Win64 CC + CCIfSubtarget<"isTargetWin64()", CCDelegateTo<RetCC_X86_Win64_C>>, + + // Otherwise, drop to normal X86-64 CC + CCDelegateTo<RetCC_X86_64_C> +]>; + +// This is the return-value convention used for the entire X86 backend. +def RetCC_X86 : CallingConv<[ + CCIfSubtarget<"is64Bit()", CCDelegateTo<RetCC_X86_64>>, + CCDelegateTo<RetCC_X86_32> +]>; + +//===----------------------------------------------------------------------===// +// X86-64 Argument Calling Conventions +//===----------------------------------------------------------------------===// + +def CC_X86_64_C : CallingConv<[ + // Handles byval parameters. + CCIfByVal<CCPassByVal<8, 8>>, + + // Promote i8/i16 arguments to i32. + CCIfType<[i8, i16], CCPromoteToType<i32>>, + + // The 'nest' parameter, if any, is passed in R10. + CCIfNest<CCAssignToReg<[R10]>>, + + // The first 6 integer arguments are passed in integer registers. + CCIfType<[i32], CCAssignToReg<[EDI, ESI, EDX, ECX, R8D, R9D]>>, + CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX, R8 , R9 ]>>, + + // The first 8 MMX vector arguments are passed in XMM registers on Darwin. 
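+ // (They are first promoted to v2i64 by the rule below, and the promoted
+ // value is then picked up by the XMM assignment that follows.)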
+ CCIfType<[x86mmx], + CCIfSubtarget<"isTargetDarwin()", + CCIfSubtarget<"hasXMMInt()", + CCPromoteToType<v2i64>>>>, + + // The first 8 FP/Vector arguments are passed in XMM registers. + CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCIfSubtarget<"hasXMM()", + CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>>, + + // The first 8 256-bit vector arguments are passed in YMM registers. + CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCIfSubtarget<"hasAVX()", + CCAssignToReg<[YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7]>>>, + + // Integer/FP values get stored in stack slots that are 8 bytes in size and + // 8-byte aligned if there are no more registers to hold them. + CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>, + + // Long doubles get stack slots whose size and alignment depends on the + // subtarget. + CCIfType<[f80], CCAssignToStack<0, 0>>, + + // Vectors get 16-byte stack slots that are 16-byte aligned. + CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>, + + // 256-bit vectors get 32-byte stack slots that are 32-byte aligned. + CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCAssignToStack<32, 32>> +]>; + +// Calling convention used on Win64 +def CC_X86_Win64_C : CallingConv<[ + // FIXME: Handle byval stuff. + // FIXME: Handle varargs. + + // Promote i8/i16 arguments to i32. + CCIfType<[i8, i16], CCPromoteToType<i32>>, + + // The 'nest' parameter, if any, is passed in R10. + CCIfNest<CCAssignToReg<[R10]>>, + + // 128 bit vectors are passed by pointer + CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCPassIndirect<i64>>, + + // The first 4 MMX vector arguments are passed in GPRs. + CCIfType<[x86mmx], CCBitConvertToType<i64>>, + + // The first 4 integer arguments are passed in integer registers. + CCIfType<[i32], CCAssignToRegWithShadow<[ECX , EDX , R8D , R9D ], + [XMM0, XMM1, XMM2, XMM3]>>, + + // Do not pass the sret argument in RCX, the Win64 thiscall calling + // convention requires "this" to be passed in RCX. + CCIfCC<"CallingConv::X86_ThisCall", + CCIfSRet<CCIfType<[i64], CCAssignToRegWithShadow<[RDX , R8 , R9 ], + [XMM1, XMM2, XMM3]>>>>, + + CCIfType<[i64], CCAssignToRegWithShadow<[RCX , RDX , R8 , R9 ], + [XMM0, XMM1, XMM2, XMM3]>>, + + // The first 4 FP/Vector arguments are passed in XMM registers. + CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCAssignToRegWithShadow<[XMM0, XMM1, XMM2, XMM3], + [RCX , RDX , R8 , R9 ]>>, + + // Integer/FP values get stored in stack slots that are 8 bytes in size and + // 8-byte aligned if there are no more registers to hold them. + CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>, + + // Long doubles get stack slots whose size and alignment depends on the + // subtarget. + CCIfType<[f80], CCAssignToStack<0, 0>> +]>; + +def CC_X86_64_GHC : CallingConv<[ + // Promote i8/i16/i32 arguments to i64. 
+ CCIfType<[i8, i16, i32], CCPromoteToType<i64>>, + + // Pass in STG registers: Base, Sp, Hp, R1, R2, R3, R4, R5, R6, SpLim + CCIfType<[i64], + CCAssignToReg<[R13, RBP, R12, RBX, R14, RSI, RDI, R8, R9, R15]>>, + + // Pass in STG registers: F1, F2, F3, F4, D1, D2 + CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCIfSubtarget<"hasXMM()", + CCAssignToReg<[XMM1, XMM2, XMM3, XMM4, XMM5, XMM6]>>> +]>; + +//===----------------------------------------------------------------------===// +// X86 C Calling Convention +//===----------------------------------------------------------------------===// + +/// CC_X86_32_Common - In all X86-32 calling conventions, extra integers and FP +/// values are spilled on the stack, and the first 4 vector values go in XMM +/// regs. +def CC_X86_32_Common : CallingConv<[ + // Handles byval parameters. + CCIfByVal<CCPassByVal<4, 4>>, + + // The first 3 float or double arguments, if marked 'inreg' and if the call + // is not a vararg call and if SSE2 is available, are passed in SSE registers. + CCIfNotVarArg<CCIfInReg<CCIfType<[f32,f64], + CCIfSubtarget<"hasXMMInt()", + CCAssignToReg<[XMM0,XMM1,XMM2]>>>>>, + + // The first 3 __m64 vector arguments are passed in mmx registers if the + // call is not a vararg call. + CCIfNotVarArg<CCIfType<[x86mmx], + CCAssignToReg<[MM0, MM1, MM2]>>>, + + // Integer/Float values get stored in stack slots that are 4 bytes in + // size and 4-byte aligned. + CCIfType<[i32, f32], CCAssignToStack<4, 4>>, + + // Doubles get 8-byte slots that are 4-byte aligned. + CCIfType<[f64], CCAssignToStack<8, 4>>, + + // Long doubles get slots whose size depends on the subtarget. + CCIfType<[f80], CCAssignToStack<0, 4>>, + + // The first 4 SSE vector arguments are passed in XMM registers. + CCIfNotVarArg<CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>>, + + // The first 4 AVX 256-bit vector arguments are passed in YMM registers. + CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCIfSubtarget<"hasAVX()", + CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>>>, + + // Other SSE vectors get 16-byte stack slots that are 16-byte aligned. + CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>, + + // 256-bit AVX vectors get 32-byte stack slots that are 32-byte aligned. + CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCAssignToStack<32, 32>>, + + // __m64 vectors get 8-byte stack slots that are 4-byte aligned. They are + // passed in the parameter area. + CCIfType<[x86mmx], CCAssignToStack<8, 4>>]>; + +def CC_X86_32_C : CallingConv<[ + // Promote i8/i16 arguments to i32. + CCIfType<[i8, i16], CCPromoteToType<i32>>, + + // The 'nest' parameter, if any, is passed in ECX. + CCIfNest<CCAssignToReg<[ECX]>>, + + // The first 3 integer arguments, if marked 'inreg' and if the call is not + // a vararg call, are passed in integer registers. + CCIfNotVarArg<CCIfInReg<CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>>>, + + // Otherwise, same as everything else. + CCDelegateTo<CC_X86_32_Common> +]>; + +def CC_X86_32_FastCall : CallingConv<[ + // Promote i8/i16 arguments to i32. + CCIfType<[i8, i16], CCPromoteToType<i32>>, + + // The 'nest' parameter, if any, is passed in EAX. + CCIfNest<CCAssignToReg<[EAX]>>, + + // The first 2 integer arguments are passed in ECX/EDX + CCIfType<[i32], CCAssignToReg<[ECX, EDX]>>, + + // Otherwise, same as everything else. 
+ CCDelegateTo<CC_X86_32_Common> +]>; + +def CC_X86_32_ThisCall : CallingConv<[ + // Promote i8/i16 arguments to i32. + CCIfType<[i8, i16], CCPromoteToType<i32>>, + + // The 'nest' parameter, if any, is passed in EAX. + CCIfNest<CCAssignToReg<[EAX]>>, + + // The first integer argument is passed in ECX + CCIfType<[i32], CCAssignToReg<[ECX]>>, + + // Otherwise, same as everything else. + CCDelegateTo<CC_X86_32_Common> +]>; + +def CC_X86_32_FastCC : CallingConv<[ + // Handles byval parameters. Note that we can't rely on the delegation + // to CC_X86_32_Common for this because that happens after code that + // puts arguments in registers. + CCIfByVal<CCPassByVal<4, 4>>, + + // Promote i8/i16 arguments to i32. + CCIfType<[i8, i16], CCPromoteToType<i32>>, + + // The 'nest' parameter, if any, is passed in EAX. + CCIfNest<CCAssignToReg<[EAX]>>, + + // The first 2 integer arguments are passed in ECX/EDX + CCIfType<[i32], CCAssignToReg<[ECX, EDX]>>, + + // The first 3 float or double arguments, if the call is not a vararg + // call and if SSE2 is available, are passed in SSE registers. + CCIfNotVarArg<CCIfType<[f32,f64], + CCIfSubtarget<"hasXMMInt()", + CCAssignToReg<[XMM0,XMM1,XMM2]>>>>, + + // Doubles get 8-byte slots that are 8-byte aligned. + CCIfType<[f64], CCAssignToStack<8, 8>>, + + // Otherwise, same as everything else. + CCDelegateTo<CC_X86_32_Common> +]>; + +def CC_X86_32_GHC : CallingConv<[ + // Promote i8/i16 arguments to i32. + CCIfType<[i8, i16], CCPromoteToType<i32>>, + + // Pass in STG registers: Base, Sp, Hp, R1 + CCIfType<[i32], CCAssignToReg<[EBX, EBP, EDI, ESI]>> +]>; + +//===----------------------------------------------------------------------===// +// X86 Root Argument Calling Conventions +//===----------------------------------------------------------------------===// + +// This is the root argument convention for the X86-32 backend. +def CC_X86_32 : CallingConv<[ + CCIfCC<"CallingConv::X86_FastCall", CCDelegateTo<CC_X86_32_FastCall>>, + CCIfCC<"CallingConv::X86_ThisCall", CCDelegateTo<CC_X86_32_ThisCall>>, + CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_X86_32_FastCC>>, + CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_32_GHC>>, + + // Otherwise, drop to normal X86-32 CC + CCDelegateTo<CC_X86_32_C> +]>; + +// This is the root argument convention for the X86-64 backend. +def CC_X86_64 : CallingConv<[ + CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_64_GHC>>, + + // Mingw64 and native Win64 use Win64 CC + CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>, + + // Otherwise, drop to normal X86-64 CC + CCDelegateTo<CC_X86_64_C> +]>; + +// This is the argument convention used for the entire X86 backend. +def CC_X86 : CallingConv<[ + CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64>>, + CCDelegateTo<CC_X86_32> +]>; diff --git a/contrib/llvm/lib/Target/X86/X86CodeEmitter.cpp b/contrib/llvm/lib/Target/X86/X86CodeEmitter.cpp new file mode 100644 index 0000000..aeff03a --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86CodeEmitter.cpp @@ -0,0 +1,999 @@ +//===-- X86/X86CodeEmitter.cpp - Convert X86 code to machine code ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the pass that transforms the X86 machine instructions into +// relocatable machine code. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "x86-emitter" +#include "X86InstrInfo.h" +#include "X86JITInfo.h" +#include "X86Subtarget.h" +#include "X86TargetMachine.h" +#include "X86Relocations.h" +#include "X86.h" +#include "llvm/LLVMContext.h" +#include "llvm/PassManager.h" +#include "llvm/CodeGen/JITCodeEmitter.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Function.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetOptions.h" +using namespace llvm; + +STATISTIC(NumEmitted, "Number of machine instructions emitted"); + +namespace { + template<class CodeEmitter> + class Emitter : public MachineFunctionPass { + const X86InstrInfo *II; + const TargetData *TD; + X86TargetMachine &TM; + CodeEmitter &MCE; + MachineModuleInfo *MMI; + intptr_t PICBaseOffset; + bool Is64BitMode; + bool IsPIC; + public: + static char ID; + explicit Emitter(X86TargetMachine &tm, CodeEmitter &mce) + : MachineFunctionPass(ID), II(0), TD(0), TM(tm), + MCE(mce), PICBaseOffset(0), Is64BitMode(false), + IsPIC(TM.getRelocationModel() == Reloc::PIC_) {} + Emitter(X86TargetMachine &tm, CodeEmitter &mce, + const X86InstrInfo &ii, const TargetData &td, bool is64) + : MachineFunctionPass(ID), II(&ii), TD(&td), TM(tm), + MCE(mce), PICBaseOffset(0), Is64BitMode(is64), + IsPIC(TM.getRelocationModel() == Reloc::PIC_) {} + + bool runOnMachineFunction(MachineFunction &MF); + + virtual const char *getPassName() const { + return "X86 Machine Code Emitter"; + } + + void emitInstruction(MachineInstr &MI, const MCInstrDesc *Desc); + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AU.addRequired<MachineModuleInfo>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + private: + void emitPCRelativeBlockAddress(MachineBasicBlock *MBB); + void emitGlobalAddress(const GlobalValue *GV, unsigned Reloc, + intptr_t Disp = 0, intptr_t PCAdj = 0, + bool Indirect = false); + void emitExternalSymbolAddress(const char *ES, unsigned Reloc); + void emitConstPoolAddress(unsigned CPI, unsigned Reloc, intptr_t Disp = 0, + intptr_t PCAdj = 0); + void emitJumpTableAddress(unsigned JTI, unsigned Reloc, + intptr_t PCAdj = 0); + + void emitDisplacementField(const MachineOperand *RelocOp, int DispVal, + intptr_t Adj = 0, bool IsPCRel = true); + + void emitRegModRMByte(unsigned ModRMReg, unsigned RegOpcodeField); + void emitRegModRMByte(unsigned RegOpcodeField); + void emitSIBByte(unsigned SS, unsigned Index, unsigned Base); + void emitConstant(uint64_t Val, unsigned Size); + + void emitMemModRMByte(const MachineInstr &MI, + unsigned Op, unsigned RegOpcodeField, + intptr_t PCAdj = 0); + }; + +template<class CodeEmitter> + char Emitter<CodeEmitter>::ID = 0; +} // end anonymous namespace. + +/// createX86CodeEmitterPass - Return a pass that emits the collected X86 code +/// to the specified templated MachineCodeEmitter object. 
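+///
+/// Usage sketch (illustrative; the call site is assumed, not shown here):
+///   PM.add(createX86JITCodeEmitterPass(TM, JCE));
+/// where PM is the JIT's pass manager, TM the X86TargetMachine and JCE the
+/// JITCodeEmitter being emitted into.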
+FunctionPass *llvm::createX86JITCodeEmitterPass(X86TargetMachine &TM, + JITCodeEmitter &JCE) { + return new Emitter<JITCodeEmitter>(TM, JCE); +} + +template<class CodeEmitter> +bool Emitter<CodeEmitter>::runOnMachineFunction(MachineFunction &MF) { + MMI = &getAnalysis<MachineModuleInfo>(); + MCE.setModuleInfo(MMI); + + II = TM.getInstrInfo(); + TD = TM.getTargetData(); + Is64BitMode = TM.getSubtarget<X86Subtarget>().is64Bit(); + IsPIC = TM.getRelocationModel() == Reloc::PIC_; + + do { + DEBUG(dbgs() << "JITTing function '" + << MF.getFunction()->getName() << "'\n"); + MCE.startFunction(MF); + for (MachineFunction::iterator MBB = MF.begin(), E = MF.end(); + MBB != E; ++MBB) { + MCE.StartMachineBasicBlock(MBB); + for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); + I != E; ++I) { + const MCInstrDesc &Desc = I->getDesc(); + emitInstruction(*I, &Desc); + // MOVPC32r is basically a call plus a pop instruction. + if (Desc.getOpcode() == X86::MOVPC32r) + emitInstruction(*I, &II->get(X86::POP32r)); + ++NumEmitted; // Keep track of the # of mi's emitted + } + } + } while (MCE.finishFunction(MF)); + + return false; +} + +/// determineREX - Determine if the MachineInstr has to be encoded with a X86-64 +/// REX prefix which specifies 1) 64-bit instructions, 2) non-default operand +/// size, and 3) use of X86-64 extended registers. +static unsigned determineREX(const MachineInstr &MI) { + unsigned REX = 0; + const MCInstrDesc &Desc = MI.getDesc(); + + // Pseudo instructions do not need REX prefix byte. + if ((Desc.TSFlags & X86II::FormMask) == X86II::Pseudo) + return 0; + if (Desc.TSFlags & X86II::REX_W) + REX |= 1 << 3; + + unsigned NumOps = Desc.getNumOperands(); + if (NumOps) { + bool isTwoAddr = NumOps > 1 && + Desc.getOperandConstraint(1, MCOI::TIED_TO) != -1; + + // If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix. + unsigned i = isTwoAddr ? 1 : 0; + for (unsigned e = NumOps; i != e; ++i) { + const MachineOperand& MO = MI.getOperand(i); + if (MO.isReg()) { + unsigned Reg = MO.getReg(); + if (X86II::isX86_64NonExtLowByteReg(Reg)) + REX |= 0x40; + } + } + + switch (Desc.TSFlags & X86II::FormMask) { + case X86II::MRMInitReg: + if (X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0))) + REX |= (1 << 0) | (1 << 2); + break; + case X86II::MRMSrcReg: { + if (X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0))) + REX |= 1 << 2; + i = isTwoAddr ? 2 : 1; + for (unsigned e = NumOps; i != e; ++i) { + const MachineOperand& MO = MI.getOperand(i); + if (X86InstrInfo::isX86_64ExtendedReg(MO)) + REX |= 1 << 0; + } + break; + } + case X86II::MRMSrcMem: { + if (X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0))) + REX |= 1 << 2; + unsigned Bit = 0; + i = isTwoAddr ? 2 : 1; + for (; i != NumOps; ++i) { + const MachineOperand& MO = MI.getOperand(i); + if (MO.isReg()) { + if (X86InstrInfo::isX86_64ExtendedReg(MO)) + REX |= 1 << Bit; + Bit++; + } + } + break; + } + case X86II::MRM0m: case X86II::MRM1m: + case X86II::MRM2m: case X86II::MRM3m: + case X86II::MRM4m: case X86II::MRM5m: + case X86II::MRM6m: case X86II::MRM7m: + case X86II::MRMDestMem: { + unsigned e = (isTwoAddr ? X86::AddrNumOperands+1 : X86::AddrNumOperands); + i = isTwoAddr ? 
1 : 0; + if (NumOps > e && X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(e))) + REX |= 1 << 2; + unsigned Bit = 0; + for (; i != e; ++i) { + const MachineOperand& MO = MI.getOperand(i); + if (MO.isReg()) { + if (X86InstrInfo::isX86_64ExtendedReg(MO)) + REX |= 1 << Bit; + Bit++; + } + } + break; + } + default: { + if (X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0))) + REX |= 1 << 0; + i = isTwoAddr ? 2 : 1; + for (unsigned e = NumOps; i != e; ++i) { + const MachineOperand& MO = MI.getOperand(i); + if (X86InstrInfo::isX86_64ExtendedReg(MO)) + REX |= 1 << 2; + } + break; + } + } + } + return REX; +} + + +/// emitPCRelativeBlockAddress - This method keeps track of the information +/// necessary to resolve the address of this block later and emits a dummy +/// value. +/// +template<class CodeEmitter> +void Emitter<CodeEmitter>::emitPCRelativeBlockAddress(MachineBasicBlock *MBB) { + // Remember where this reference was and where it is to so we can + // deal with it later. + MCE.addRelocation(MachineRelocation::getBB(MCE.getCurrentPCOffset(), + X86::reloc_pcrel_word, MBB)); + MCE.emitWordLE(0); +} + +/// emitGlobalAddress - Emit the specified address to the code stream assuming +/// this is part of a "take the address of a global" instruction. +/// +template<class CodeEmitter> +void Emitter<CodeEmitter>::emitGlobalAddress(const GlobalValue *GV, + unsigned Reloc, + intptr_t Disp /* = 0 */, + intptr_t PCAdj /* = 0 */, + bool Indirect /* = false */) { + intptr_t RelocCST = Disp; + if (Reloc == X86::reloc_picrel_word) + RelocCST = PICBaseOffset; + else if (Reloc == X86::reloc_pcrel_word) + RelocCST = PCAdj; + MachineRelocation MR = Indirect + ? MachineRelocation::getIndirectSymbol(MCE.getCurrentPCOffset(), Reloc, + const_cast<GlobalValue *>(GV), + RelocCST, false) + : MachineRelocation::getGV(MCE.getCurrentPCOffset(), Reloc, + const_cast<GlobalValue *>(GV), RelocCST, false); + MCE.addRelocation(MR); + // The relocated value will be added to the displacement + if (Reloc == X86::reloc_absolute_dword) + MCE.emitDWordLE(Disp); + else + MCE.emitWordLE((int32_t)Disp); +} + +/// emitExternalSymbolAddress - Arrange for the address of an external symbol to +/// be emitted to the current location in the function, and allow it to be PC +/// relative. +template<class CodeEmitter> +void Emitter<CodeEmitter>::emitExternalSymbolAddress(const char *ES, + unsigned Reloc) { + intptr_t RelocCST = (Reloc == X86::reloc_picrel_word) ? PICBaseOffset : 0; + + // X86 never needs stubs because instruction selection will always pick + // an instruction sequence that is large enough to hold any address + // to a symbol. + // (see X86ISelLowering.cpp, near 2039: X86TargetLowering::LowerCall) + bool NeedStub = false; + MCE.addRelocation(MachineRelocation::getExtSym(MCE.getCurrentPCOffset(), + Reloc, ES, RelocCST, + 0, NeedStub)); + if (Reloc == X86::reloc_absolute_dword) + MCE.emitDWordLE(0); + else + MCE.emitWordLE(0); +} + +/// emitConstPoolAddress - Arrange for the address of an constant pool +/// to be emitted to the current location in the function, and allow it to be PC +/// relative. 
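determineREX above builds the prefix one bit at a time, but the byte it produces always has the fixed 0100WRXB layout: W for 64-bit operand size, and R/X/B extending the ModRM reg, SIB index, and ModRM rm/SIB base fields. A minimal packing helper, independent of the emitter (names mine):

#include <cassert>
#include <cstdint>

// REX prefix layout: 0100 W R X B.
static uint8_t packREX(bool W, bool R, bool X, bool B) {
  return 0x40 | (uint8_t(W) << 3) | (uint8_t(R) << 2) |
                (uint8_t(X) << 1) |  uint8_t(B);
}

int main() {
  assert(packREX(true, false, false, false) == 0x48);  // REX.W, e.g. 64-bit op
  assert(packREX(false, false, false, true) == 0x41);  // REX.B, e.g. r8 as rm
  return 0;
}

A bare 0x40 with all payload bits clear is still meaningful: as the code above notes, it is required to reach SPL/BPL/SIL/DIL.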
+template<class CodeEmitter> +void Emitter<CodeEmitter>::emitConstPoolAddress(unsigned CPI, unsigned Reloc, + intptr_t Disp /* = 0 */, + intptr_t PCAdj /* = 0 */) { + intptr_t RelocCST = 0; + if (Reloc == X86::reloc_picrel_word) + RelocCST = PICBaseOffset; + else if (Reloc == X86::reloc_pcrel_word) + RelocCST = PCAdj; + MCE.addRelocation(MachineRelocation::getConstPool(MCE.getCurrentPCOffset(), + Reloc, CPI, RelocCST)); + // The relocated value will be added to the displacement + if (Reloc == X86::reloc_absolute_dword) + MCE.emitDWordLE(Disp); + else + MCE.emitWordLE((int32_t)Disp); +} + +/// emitJumpTableAddress - Arrange for the address of a jump table to +/// be emitted to the current location in the function, and allow it to be PC +/// relative. +template<class CodeEmitter> +void Emitter<CodeEmitter>::emitJumpTableAddress(unsigned JTI, unsigned Reloc, + intptr_t PCAdj /* = 0 */) { + intptr_t RelocCST = 0; + if (Reloc == X86::reloc_picrel_word) + RelocCST = PICBaseOffset; + else if (Reloc == X86::reloc_pcrel_word) + RelocCST = PCAdj; + MCE.addRelocation(MachineRelocation::getJumpTable(MCE.getCurrentPCOffset(), + Reloc, JTI, RelocCST)); + // The relocated value will be added to the displacement + if (Reloc == X86::reloc_absolute_dword) + MCE.emitDWordLE(0); + else + MCE.emitWordLE(0); +} + +inline static unsigned char ModRMByte(unsigned Mod, unsigned RegOpcode, + unsigned RM) { + assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!"); + return RM | (RegOpcode << 3) | (Mod << 6); +} + +template<class CodeEmitter> +void Emitter<CodeEmitter>::emitRegModRMByte(unsigned ModRMReg, + unsigned RegOpcodeFld){ + MCE.emitByte(ModRMByte(3, RegOpcodeFld, X86_MC::getX86RegNum(ModRMReg))); +} + +template<class CodeEmitter> +void Emitter<CodeEmitter>::emitRegModRMByte(unsigned RegOpcodeFld) { + MCE.emitByte(ModRMByte(3, RegOpcodeFld, 0)); +} + +template<class CodeEmitter> +void Emitter<CodeEmitter>::emitSIBByte(unsigned SS, + unsigned Index, + unsigned Base) { + // SIB byte is in the same format as the ModRMByte... + MCE.emitByte(ModRMByte(SS, Index, Base)); +} + +template<class CodeEmitter> +void Emitter<CodeEmitter>::emitConstant(uint64_t Val, unsigned Size) { + // Output the constant in little endian byte order... + for (unsigned i = 0; i != Size; ++i) { + MCE.emitByte(Val & 255); + Val >>= 8; + } +} + +/// isDisp8 - Return true if this signed displacement fits in a 8-bit +/// sign-extended field. +static bool isDisp8(int Value) { + return Value == (signed char)Value; +} + +static bool gvNeedsNonLazyPtr(const MachineOperand &GVOp, + const TargetMachine &TM) { + // For Darwin-64, simulate the linktime GOT by using the same non-lazy-pointer + // mechanism as 32-bit mode. + if (TM.getSubtarget<X86Subtarget>().is64Bit() && + !TM.getSubtarget<X86Subtarget>().isTargetDarwin()) + return false; + + // Return true if this is a reference to a stub containing the address of the + // global, not the global itself. + return isGlobalStubReference(GVOp.getTargetFlags()); +} + +template<class CodeEmitter> +void Emitter<CodeEmitter>::emitDisplacementField(const MachineOperand *RelocOp, + int DispVal, + intptr_t Adj /* = 0 */, + bool IsPCRel /* = true */) { + // If this is a simple integer displacement that doesn't require a relocation, + // emit it now. + if (!RelocOp) { + emitConstant(DispVal, 4); + return; + } + + // Otherwise, this is something that requires a relocation. Emit it as such + // now. + unsigned RelocType = Is64BitMode ? + (IsPCRel ? 
X86::reloc_pcrel_word : X86::reloc_absolute_word_sext) + : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word); + if (RelocOp->isGlobal()) { + // In 64-bit static small code model, we could potentially emit absolute. + // But it's probably not beneficial. If the MCE supports using RIP directly + // do it, otherwise fallback to absolute (this is determined by IsPCRel). + // 89 05 00 00 00 00 mov %eax,0(%rip) # PC-relative + // 89 04 25 00 00 00 00 mov %eax,0x0 # Absolute + bool Indirect = gvNeedsNonLazyPtr(*RelocOp, TM); + emitGlobalAddress(RelocOp->getGlobal(), RelocType, RelocOp->getOffset(), + Adj, Indirect); + } else if (RelocOp->isSymbol()) { + emitExternalSymbolAddress(RelocOp->getSymbolName(), RelocType); + } else if (RelocOp->isCPI()) { + emitConstPoolAddress(RelocOp->getIndex(), RelocType, + RelocOp->getOffset(), Adj); + } else { + assert(RelocOp->isJTI() && "Unexpected machine operand!"); + emitJumpTableAddress(RelocOp->getIndex(), RelocType, Adj); + } +} + +template<class CodeEmitter> +void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI, + unsigned Op,unsigned RegOpcodeField, + intptr_t PCAdj) { + const MachineOperand &Op3 = MI.getOperand(Op+3); + int DispVal = 0; + const MachineOperand *DispForReloc = 0; + + // Figure out what sort of displacement we have to handle here. + if (Op3.isGlobal()) { + DispForReloc = &Op3; + } else if (Op3.isSymbol()) { + DispForReloc = &Op3; + } else if (Op3.isCPI()) { + if (!MCE.earlyResolveAddresses() || Is64BitMode || IsPIC) { + DispForReloc = &Op3; + } else { + DispVal += MCE.getConstantPoolEntryAddress(Op3.getIndex()); + DispVal += Op3.getOffset(); + } + } else if (Op3.isJTI()) { + if (!MCE.earlyResolveAddresses() || Is64BitMode || IsPIC) { + DispForReloc = &Op3; + } else { + DispVal += MCE.getJumpTableEntryAddress(Op3.getIndex()); + } + } else { + DispVal = Op3.getImm(); + } + + const MachineOperand &Base = MI.getOperand(Op); + const MachineOperand &Scale = MI.getOperand(Op+1); + const MachineOperand &IndexReg = MI.getOperand(Op+2); + + unsigned BaseReg = Base.getReg(); + + // Handle %rip relative addressing. + if (BaseReg == X86::RIP || + (Is64BitMode && DispForReloc)) { // [disp32+RIP] in X86-64 mode + assert(IndexReg.getReg() == 0 && Is64BitMode && + "Invalid rip-relative address"); + MCE.emitByte(ModRMByte(0, RegOpcodeField, 5)); + emitDisplacementField(DispForReloc, DispVal, PCAdj, true); + return; + } + + // Indicate that the displacement will use an pcrel or absolute reference + // by default. MCEs able to resolve addresses on-the-fly use pcrel by default + // while others, unless explicit asked to use RIP, use absolute references. + bool IsPCRel = MCE.earlyResolveAddresses() ? true : false; + + // Is a SIB byte needed? + // If no BaseReg, issue a RIP relative instruction only if the MCE can + // resolve addresses on-the-fly, otherwise use SIB (Intel Manual 2A, table + // 2-7) and absolute references. + unsigned BaseRegNo = -1U; + if (BaseReg != 0 && BaseReg != X86::RIP) + BaseRegNo = X86_MC::getX86RegNum(BaseReg); + + if (// The SIB byte must be used if there is an index register. + IndexReg.getReg() == 0 && + // The SIB byte must be used if the base is ESP/RSP/R12, all of which + // encode to an R/M value of 4, which indicates that a SIB byte is + // present. + BaseRegNo != N86::ESP && + // If there is no base register and we're in 64-bit mode, we need a SIB + // byte to emit an addr that is just 'disp32' (the non-RIP relative form). 
+ (!Is64BitMode || BaseReg != 0)) { + if (BaseReg == 0 || // [disp32] in X86-32 mode + BaseReg == X86::RIP) { // [disp32+RIP] in X86-64 mode + MCE.emitByte(ModRMByte(0, RegOpcodeField, 5)); + emitDisplacementField(DispForReloc, DispVal, PCAdj, true); + return; + } + + // If the base is not EBP/ESP and there is no displacement, use simple + // indirect register encoding, this handles addresses like [EAX]. The + // encoding for [EBP] with no displacement means [disp32] so we handle it + // by emitting a displacement of 0 below. + if (!DispForReloc && DispVal == 0 && BaseRegNo != N86::EBP) { + MCE.emitByte(ModRMByte(0, RegOpcodeField, BaseRegNo)); + return; + } + + // Otherwise, if the displacement fits in a byte, encode as [REG+disp8]. + if (!DispForReloc && isDisp8(DispVal)) { + MCE.emitByte(ModRMByte(1, RegOpcodeField, BaseRegNo)); + emitConstant(DispVal, 1); + return; + } + + // Otherwise, emit the most general non-SIB encoding: [REG+disp32] + MCE.emitByte(ModRMByte(2, RegOpcodeField, BaseRegNo)); + emitDisplacementField(DispForReloc, DispVal, PCAdj, IsPCRel); + return; + } + + // Otherwise we need a SIB byte, so start by outputting the ModR/M byte first. + assert(IndexReg.getReg() != X86::ESP && + IndexReg.getReg() != X86::RSP && "Cannot use ESP as index reg!"); + + bool ForceDisp32 = false; + bool ForceDisp8 = false; + if (BaseReg == 0) { + // If there is no base register, we emit the special case SIB byte with + // MOD=0, BASE=4, to JUST get the index, scale, and displacement. + MCE.emitByte(ModRMByte(0, RegOpcodeField, 4)); + ForceDisp32 = true; + } else if (DispForReloc) { + // Emit the normal disp32 encoding. + MCE.emitByte(ModRMByte(2, RegOpcodeField, 4)); + ForceDisp32 = true; + } else if (DispVal == 0 && BaseRegNo != N86::EBP) { + // Emit no displacement ModR/M byte + MCE.emitByte(ModRMByte(0, RegOpcodeField, 4)); + } else if (isDisp8(DispVal)) { + // Emit the disp8 encoding... + MCE.emitByte(ModRMByte(1, RegOpcodeField, 4)); + ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP + } else { + // Emit the normal disp32 encoding... + MCE.emitByte(ModRMByte(2, RegOpcodeField, 4)); + } + + // Calculate what the SS field value should be... + static const unsigned SSTable[] = { ~0U, 0, 1, ~0U, 2, ~0U, ~0U, ~0U, 3 }; + unsigned SS = SSTable[Scale.getImm()]; + + if (BaseReg == 0) { + // Handle the SIB byte for the case where there is no base, see Intel + // Manual 2A, table 2-7. The displacement has already been output. + unsigned IndexRegNo; + if (IndexReg.getReg()) + IndexRegNo = X86_MC::getX86RegNum(IndexReg.getReg()); + else // Examples: [ESP+1*<noreg>+4] or [scaled idx]+disp32 (MOD=0,BASE=5) + IndexRegNo = 4; + emitSIBByte(SS, IndexRegNo, 5); + } else { + unsigned BaseRegNo = X86_MC::getX86RegNum(BaseReg); + unsigned IndexRegNo; + if (IndexReg.getReg()) + IndexRegNo = X86_MC::getX86RegNum(IndexReg.getReg()); + else + IndexRegNo = 4; // For example [ESP+1*<noreg>+4] + emitSIBByte(SS, IndexRegNo, BaseRegNo); + } + + // Do we need to output a displacement? + if (ForceDisp8) { + emitConstant(DispVal, 1); + } else if (DispVal != 0 || ForceDisp32) { + emitDisplacementField(DispForReloc, DispVal, PCAdj, IsPCRel); + } +} + +template<class CodeEmitter> +void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, + const MCInstrDesc *Desc) { + DEBUG(dbgs() << MI); + + // If this is a pseudo instruction, lower it. 
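The memory-operand logic above reduces to a small amount of bit packing: mod picks the displacement form, reg carries a register number or opcode extension, and rm names the base (with 4 meaning "a SIB byte follows"). For the plain [base+disp] case, the choice between no displacement, disp8 and disp32 looks like the sketch below; ESP, which always needs a SIB byte, and EBP with mod 0, which means disp32, are deliberately ignored.

#include <cstdint>
#include <cstdio>
#include <vector>

static uint8_t modRM(unsigned Mod, unsigned Reg, unsigned RM) {
  return uint8_t((Mod << 6) | (Reg << 3) | RM);   // same layout as ModRMByte()
}

// Encode ModRM plus displacement for [base + disp].
static void encodeRegDisp(std::vector<uint8_t> &Out, unsigned RegField,
                          unsigned BaseRegNo, int32_t Disp) {
  if (Disp == 0) {
    Out.push_back(modRM(0, RegField, BaseRegNo));         // [base]
  } else if (Disp == (int8_t)Disp) {                      // the isDisp8 test
    Out.push_back(modRM(1, RegField, BaseRegNo));         // [base + disp8]
    Out.push_back(uint8_t(Disp));
  } else {
    Out.push_back(modRM(2, RegField, BaseRegNo));         // [base + disp32]
    for (int i = 0; i != 4; ++i)
      Out.push_back(uint8_t(Disp >> (8 * i)));            // little endian
  }
}

int main() {
  std::vector<uint8_t> Buf;
  encodeRegDisp(Buf, /*RegField=*/0, /*EAX=*/0, 0x12345678);
  for (uint8_t B : Buf)
    std::printf("%02x ", B);                              // 80 78 56 34 12
  std::printf("\n");
  return 0;
}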
+ switch (Desc->getOpcode()) { + case X86::ADD16rr_DB: Desc = &II->get(X86::OR16rr); MI.setDesc(*Desc);break; + case X86::ADD32rr_DB: Desc = &II->get(X86::OR32rr); MI.setDesc(*Desc);break; + case X86::ADD64rr_DB: Desc = &II->get(X86::OR64rr); MI.setDesc(*Desc);break; + case X86::ADD16ri_DB: Desc = &II->get(X86::OR16ri); MI.setDesc(*Desc);break; + case X86::ADD32ri_DB: Desc = &II->get(X86::OR32ri); MI.setDesc(*Desc);break; + case X86::ADD64ri32_DB:Desc = &II->get(X86::OR64ri32);MI.setDesc(*Desc);break; + case X86::ADD16ri8_DB: Desc = &II->get(X86::OR16ri8);MI.setDesc(*Desc);break; + case X86::ADD32ri8_DB: Desc = &II->get(X86::OR32ri8);MI.setDesc(*Desc);break; + case X86::ADD64ri8_DB: Desc = &II->get(X86::OR64ri8);MI.setDesc(*Desc);break; + } + + + MCE.processDebugLoc(MI.getDebugLoc(), true); + + unsigned Opcode = Desc->Opcode; + + // Emit the lock opcode prefix as needed. + if (Desc->TSFlags & X86II::LOCK) + MCE.emitByte(0xF0); + + // Emit segment override opcode prefix as needed. + switch (Desc->TSFlags & X86II::SegOvrMask) { + case X86II::FS: + MCE.emitByte(0x64); + break; + case X86II::GS: + MCE.emitByte(0x65); + break; + default: llvm_unreachable("Invalid segment!"); + case 0: break; // No segment override! + } + + // Emit the repeat opcode prefix as needed. + if ((Desc->TSFlags & X86II::Op0Mask) == X86II::REP) + MCE.emitByte(0xF3); + + // Emit the operand size opcode prefix as needed. + if (Desc->TSFlags & X86II::OpSize) + MCE.emitByte(0x66); + + // Emit the address size opcode prefix as needed. + if (Desc->TSFlags & X86II::AdSize) + MCE.emitByte(0x67); + + bool Need0FPrefix = false; + switch (Desc->TSFlags & X86II::Op0Mask) { + case X86II::TB: // Two-byte opcode prefix + case X86II::T8: // 0F 38 + case X86II::TA: // 0F 3A + case X86II::A6: // 0F A6 + case X86II::A7: // 0F A7 + Need0FPrefix = true; + break; + case X86II::TF: // F2 0F 38 + MCE.emitByte(0xF2); + Need0FPrefix = true; + break; + case X86II::REP: break; // already handled. + case X86II::XS: // F3 0F + MCE.emitByte(0xF3); + Need0FPrefix = true; + break; + case X86II::XD: // F2 0F + MCE.emitByte(0xF2); + Need0FPrefix = true; + break; + case X86II::D8: case X86II::D9: case X86II::DA: case X86II::DB: + case X86II::DC: case X86II::DD: case X86II::DE: case X86II::DF: + MCE.emitByte(0xD8+ + (((Desc->TSFlags & X86II::Op0Mask)-X86II::D8) + >> X86II::Op0Shift)); + break; // Two-byte opcode prefix + default: llvm_unreachable("Invalid prefix!"); + case 0: break; // No prefix! + } + + // Handle REX prefix. + if (Is64BitMode) { + if (unsigned REX = determineREX(MI)) + MCE.emitByte(0x40 | REX); + } + + // 0x0F escape code must be emitted just before the opcode. + if (Need0FPrefix) + MCE.emitByte(0x0F); + + switch (Desc->TSFlags & X86II::Op0Mask) { + case X86II::TF: // F2 0F 38 + case X86II::T8: // 0F 38 + MCE.emitByte(0x38); + break; + case X86II::TA: // 0F 3A + MCE.emitByte(0x3A); + break; + case X86II::A6: // 0F A6 + MCE.emitByte(0xA6); + break; + case X86II::A7: // 0F A7 + MCE.emitByte(0xA7); + break; + } + + // If this is a two-address instruction, skip one of the register operands. + unsigned NumOps = Desc->getNumOperands(); + unsigned CurOp = 0; + if (NumOps > 1 && Desc->getOperandConstraint(1, MCOI::TIED_TO) != -1) + ++CurOp; + else if (NumOps > 2 && Desc->getOperandConstraint(NumOps-1,MCOI::TIED_TO)== 0) + // Skip the last source operand that is tied_to the dest reg. e.g. 
LXADD32 + --NumOps; + + unsigned char BaseOpcode = X86II::getBaseOpcodeFor(Desc->TSFlags); + switch (Desc->TSFlags & X86II::FormMask) { + default: + llvm_unreachable("Unknown FormMask value in X86 MachineCodeEmitter!"); + case X86II::Pseudo: + // Remember the current PC offset, this is the PIC relocation + // base address. + switch (Opcode) { + default: + llvm_unreachable("pseudo instructions should be removed before code" + " emission"); + break; + // Do nothing for Int_MemBarrier - it's just a comment. Add a debug + // to make it slightly easier to see. + case X86::Int_MemBarrier: + DEBUG(dbgs() << "#MEMBARRIER\n"); + break; + + case TargetOpcode::INLINEASM: + // We allow inline assembler nodes with empty bodies - they can + // implicitly define registers, which is ok for JIT. + if (MI.getOperand(0).getSymbolName()[0]) + report_fatal_error("JIT does not support inline asm!"); + break; + case TargetOpcode::PROLOG_LABEL: + case TargetOpcode::GC_LABEL: + case TargetOpcode::EH_LABEL: + MCE.emitLabel(MI.getOperand(0).getMCSymbol()); + break; + + case TargetOpcode::IMPLICIT_DEF: + case TargetOpcode::KILL: + break; + case X86::MOVPC32r: { + // This emits the "call" portion of this pseudo instruction. + MCE.emitByte(BaseOpcode); + emitConstant(0, X86II::getSizeOfImm(Desc->TSFlags)); + // Remember PIC base. + PICBaseOffset = (intptr_t) MCE.getCurrentPCOffset(); + X86JITInfo *JTI = TM.getJITInfo(); + JTI->setPICBase(MCE.getCurrentPCValue()); + break; + } + } + CurOp = NumOps; + break; + case X86II::RawFrm: { + MCE.emitByte(BaseOpcode); + + if (CurOp == NumOps) + break; + + const MachineOperand &MO = MI.getOperand(CurOp++); + + DEBUG(dbgs() << "RawFrm CurOp " << CurOp << "\n"); + DEBUG(dbgs() << "isMBB " << MO.isMBB() << "\n"); + DEBUG(dbgs() << "isGlobal " << MO.isGlobal() << "\n"); + DEBUG(dbgs() << "isSymbol " << MO.isSymbol() << "\n"); + DEBUG(dbgs() << "isImm " << MO.isImm() << "\n"); + + if (MO.isMBB()) { + emitPCRelativeBlockAddress(MO.getMBB()); + break; + } + + if (MO.isGlobal()) { + emitGlobalAddress(MO.getGlobal(), X86::reloc_pcrel_word, + MO.getOffset(), 0); + break; + } + + if (MO.isSymbol()) { + emitExternalSymbolAddress(MO.getSymbolName(), X86::reloc_pcrel_word); + break; + } + + // FIXME: Only used by hackish MCCodeEmitter, remove when dead. + if (MO.isJTI()) { + emitJumpTableAddress(MO.getIndex(), X86::reloc_pcrel_word); + break; + } + + assert(MO.isImm() && "Unknown RawFrm operand!"); + if (Opcode == X86::CALLpcrel32 || Opcode == X86::CALL64pcrel32 || + Opcode == X86::WINCALL64pcrel32) { + // Fix up immediate operand for pc relative calls. + intptr_t Imm = (intptr_t)MO.getImm(); + Imm = Imm - MCE.getCurrentPCValue() - 4; + emitConstant(Imm, X86II::getSizeOfImm(Desc->TSFlags)); + } else + emitConstant(MO.getImm(), X86II::getSizeOfImm(Desc->TSFlags)); + break; + } + + case X86II::AddRegFrm: { + MCE.emitByte(BaseOpcode + + X86_MC::getX86RegNum(MI.getOperand(CurOp++).getReg())); + + if (CurOp == NumOps) + break; + + const MachineOperand &MO1 = MI.getOperand(CurOp++); + unsigned Size = X86II::getSizeOfImm(Desc->TSFlags); + if (MO1.isImm()) { + emitConstant(MO1.getImm(), Size); + break; + } + + unsigned rt = Is64BitMode ? X86::reloc_pcrel_word + : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word); + if (Opcode == X86::MOV64ri64i32) + rt = X86::reloc_absolute_word; // FIXME: add X86II flag? + // This should not occur on Darwin for relocatable objects. + if (Opcode == X86::MOV64ri) + rt = X86::reloc_absolute_dword; // FIXME: add X86II flag? 
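The RawFrm fixup for CALLpcrel32 above is worth spelling out: the stored displacement is relative to the end of the instruction, and since getCurrentPCValue() points at the start of the 4-byte immediate, the end is four bytes further on. A standalone check with made-up addresses:

#include <cassert>
#include <cstdint>

// rel32 for a pc-relative call/jump: target minus the byte that follows the
// 4-byte displacement field.
static int32_t pcRel32(uint64_t Target, uint64_t DispAddr) {
  return int32_t(Target - (DispAddr + 4));
}

int main() {
  // E8 opcode at 0x1000, displacement at 0x1001, next instruction at 0x1005:
  // calling 0x1010 needs rel32 = 0x1010 - 0x1005 = 0x0B.
  assert(pcRel32(0x1010, 0x1001) == 0x0B);
  // Backward call to 0x0F00 from the same site.
  assert(pcRel32(0x0F00, 0x1001) == -0x105);
  return 0;
}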
+ if (MO1.isGlobal()) { + bool Indirect = gvNeedsNonLazyPtr(MO1, TM); + emitGlobalAddress(MO1.getGlobal(), rt, MO1.getOffset(), 0, + Indirect); + } else if (MO1.isSymbol()) + emitExternalSymbolAddress(MO1.getSymbolName(), rt); + else if (MO1.isCPI()) + emitConstPoolAddress(MO1.getIndex(), rt); + else if (MO1.isJTI()) + emitJumpTableAddress(MO1.getIndex(), rt); + break; + } + + case X86II::MRMDestReg: { + MCE.emitByte(BaseOpcode); + emitRegModRMByte(MI.getOperand(CurOp).getReg(), + X86_MC::getX86RegNum(MI.getOperand(CurOp+1).getReg())); + CurOp += 2; + if (CurOp != NumOps) + emitConstant(MI.getOperand(CurOp++).getImm(), + X86II::getSizeOfImm(Desc->TSFlags)); + break; + } + case X86II::MRMDestMem: { + MCE.emitByte(BaseOpcode); + emitMemModRMByte(MI, CurOp, + X86_MC::getX86RegNum(MI.getOperand(CurOp + X86::AddrNumOperands) + .getReg())); + CurOp += X86::AddrNumOperands + 1; + if (CurOp != NumOps) + emitConstant(MI.getOperand(CurOp++).getImm(), + X86II::getSizeOfImm(Desc->TSFlags)); + break; + } + + case X86II::MRMSrcReg: + MCE.emitByte(BaseOpcode); + emitRegModRMByte(MI.getOperand(CurOp+1).getReg(), + X86_MC::getX86RegNum(MI.getOperand(CurOp).getReg())); + CurOp += 2; + if (CurOp != NumOps) + emitConstant(MI.getOperand(CurOp++).getImm(), + X86II::getSizeOfImm(Desc->TSFlags)); + break; + + case X86II::MRMSrcMem: { + int AddrOperands = X86::AddrNumOperands; + + intptr_t PCAdj = (CurOp + AddrOperands + 1 != NumOps) ? + X86II::getSizeOfImm(Desc->TSFlags) : 0; + + MCE.emitByte(BaseOpcode); + emitMemModRMByte(MI, CurOp+1, + X86_MC::getX86RegNum(MI.getOperand(CurOp).getReg()),PCAdj); + CurOp += AddrOperands + 1; + if (CurOp != NumOps) + emitConstant(MI.getOperand(CurOp++).getImm(), + X86II::getSizeOfImm(Desc->TSFlags)); + break; + } + + case X86II::MRM0r: case X86II::MRM1r: + case X86II::MRM2r: case X86II::MRM3r: + case X86II::MRM4r: case X86II::MRM5r: + case X86II::MRM6r: case X86II::MRM7r: { + MCE.emitByte(BaseOpcode); + emitRegModRMByte(MI.getOperand(CurOp++).getReg(), + (Desc->TSFlags & X86II::FormMask)-X86II::MRM0r); + + if (CurOp == NumOps) + break; + + const MachineOperand &MO1 = MI.getOperand(CurOp++); + unsigned Size = X86II::getSizeOfImm(Desc->TSFlags); + if (MO1.isImm()) { + emitConstant(MO1.getImm(), Size); + break; + } + + unsigned rt = Is64BitMode ? X86::reloc_pcrel_word + : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word); + if (Opcode == X86::MOV64ri32) + rt = X86::reloc_absolute_word_sext; // FIXME: add X86II flag? + if (MO1.isGlobal()) { + bool Indirect = gvNeedsNonLazyPtr(MO1, TM); + emitGlobalAddress(MO1.getGlobal(), rt, MO1.getOffset(), 0, + Indirect); + } else if (MO1.isSymbol()) + emitExternalSymbolAddress(MO1.getSymbolName(), rt); + else if (MO1.isCPI()) + emitConstPoolAddress(MO1.getIndex(), rt); + else if (MO1.isJTI()) + emitJumpTableAddress(MO1.getIndex(), rt); + break; + } + + case X86II::MRM0m: case X86II::MRM1m: + case X86II::MRM2m: case X86II::MRM3m: + case X86II::MRM4m: case X86II::MRM5m: + case X86II::MRM6m: case X86II::MRM7m: { + intptr_t PCAdj = (CurOp + X86::AddrNumOperands != NumOps) ? + (MI.getOperand(CurOp+X86::AddrNumOperands).isImm() ? 
+ X86II::getSizeOfImm(Desc->TSFlags) : 4) : 0; + + MCE.emitByte(BaseOpcode); + emitMemModRMByte(MI, CurOp, (Desc->TSFlags & X86II::FormMask)-X86II::MRM0m, + PCAdj); + CurOp += X86::AddrNumOperands; + + if (CurOp == NumOps) + break; + + const MachineOperand &MO = MI.getOperand(CurOp++); + unsigned Size = X86II::getSizeOfImm(Desc->TSFlags); + if (MO.isImm()) { + emitConstant(MO.getImm(), Size); + break; + } + + unsigned rt = Is64BitMode ? X86::reloc_pcrel_word + : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word); + if (Opcode == X86::MOV64mi32) + rt = X86::reloc_absolute_word_sext; // FIXME: add X86II flag? + if (MO.isGlobal()) { + bool Indirect = gvNeedsNonLazyPtr(MO, TM); + emitGlobalAddress(MO.getGlobal(), rt, MO.getOffset(), 0, + Indirect); + } else if (MO.isSymbol()) + emitExternalSymbolAddress(MO.getSymbolName(), rt); + else if (MO.isCPI()) + emitConstPoolAddress(MO.getIndex(), rt); + else if (MO.isJTI()) + emitJumpTableAddress(MO.getIndex(), rt); + break; + } + + case X86II::MRMInitReg: + MCE.emitByte(BaseOpcode); + // Duplicate register, used by things like MOV8r0 (aka xor reg,reg). + emitRegModRMByte(MI.getOperand(CurOp).getReg(), + X86_MC::getX86RegNum(MI.getOperand(CurOp).getReg())); + ++CurOp; + break; + + case X86II::MRM_C1: + MCE.emitByte(BaseOpcode); + MCE.emitByte(0xC1); + break; + case X86II::MRM_C8: + MCE.emitByte(BaseOpcode); + MCE.emitByte(0xC8); + break; + case X86II::MRM_C9: + MCE.emitByte(BaseOpcode); + MCE.emitByte(0xC9); + break; + case X86II::MRM_E8: + MCE.emitByte(BaseOpcode); + MCE.emitByte(0xE8); + break; + case X86II::MRM_F0: + MCE.emitByte(BaseOpcode); + MCE.emitByte(0xF0); + break; + } + + if (!Desc->isVariadic() && CurOp != NumOps) { +#ifndef NDEBUG + dbgs() << "Cannot encode all operands of: " << MI << "\n"; +#endif + llvm_unreachable(0); + } + + MCE.processDebugLoc(MI.getDebugLoc(), false); +} diff --git a/contrib/llvm/lib/Target/X86/X86ELFWriterInfo.cpp b/contrib/llvm/lib/Target/X86/X86ELFWriterInfo.cpp new file mode 100644 index 0000000..4a72d15 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86ELFWriterInfo.cpp @@ -0,0 +1,153 @@ +//===-- X86ELFWriterInfo.cpp - ELF Writer Info for the X86 backend --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements ELF writer information for the X86 backend. +// +//===----------------------------------------------------------------------===// + +#include "X86ELFWriterInfo.h" +#include "X86Relocations.h" +#include "llvm/Function.h" +#include "llvm/Support/ELF.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +//===----------------------------------------------------------------------===// +// Implementation of the X86ELFWriterInfo class +//===----------------------------------------------------------------------===// + +X86ELFWriterInfo::X86ELFWriterInfo(bool is64Bit_, bool isLittleEndian_) + : TargetELFWriterInfo(is64Bit_, isLittleEndian_) { + EMachine = is64Bit ? 
EM_X86_64 : EM_386; + } + +X86ELFWriterInfo::~X86ELFWriterInfo() {} + +unsigned X86ELFWriterInfo::getRelocationType(unsigned MachineRelTy) const { + if (is64Bit) { + switch(MachineRelTy) { + case X86::reloc_pcrel_word: + return ELF::R_X86_64_PC32; + case X86::reloc_absolute_word: + return ELF::R_X86_64_32; + case X86::reloc_absolute_word_sext: + return ELF::R_X86_64_32S; + case X86::reloc_absolute_dword: + return ELF::R_X86_64_64; + case X86::reloc_picrel_word: + default: + llvm_unreachable("unknown x86_64 machine relocation type"); + } + } else { + switch(MachineRelTy) { + case X86::reloc_pcrel_word: + return ELF::R_386_PC32; + case X86::reloc_absolute_word: + return ELF::R_386_32; + case X86::reloc_absolute_word_sext: + case X86::reloc_absolute_dword: + case X86::reloc_picrel_word: + default: + llvm_unreachable("unknown x86 machine relocation type"); + } + } + return 0; +} + +long int X86ELFWriterInfo::getDefaultAddendForRelTy(unsigned RelTy, + long int Modifier) const { + if (is64Bit) { + switch(RelTy) { + case ELF::R_X86_64_PC32: return Modifier - 4; + case ELF::R_X86_64_32: + case ELF::R_X86_64_32S: + case ELF::R_X86_64_64: + return Modifier; + default: + llvm_unreachable("unknown x86_64 relocation type"); + } + } else { + switch(RelTy) { + case ELF::R_386_PC32: return Modifier - 4; + case ELF::R_386_32: return Modifier; + default: + llvm_unreachable("unknown x86 relocation type"); + } + } + return 0; +} + +unsigned X86ELFWriterInfo::getRelocationTySize(unsigned RelTy) const { + if (is64Bit) { + switch(RelTy) { + case ELF::R_X86_64_PC32: + case ELF::R_X86_64_32: + case ELF::R_X86_64_32S: + return 32; + case ELF::R_X86_64_64: + return 64; + default: + llvm_unreachable("unknown x86_64 relocation type"); + } + } else { + switch(RelTy) { + case ELF::R_386_PC32: + case ELF::R_386_32: + return 32; + default: + llvm_unreachable("unknown x86 relocation type"); + } + } + return 0; +} + +bool X86ELFWriterInfo::isPCRelativeRel(unsigned RelTy) const { + if (is64Bit) { + switch(RelTy) { + case ELF::R_X86_64_PC32: + return true; + case ELF::R_X86_64_32: + case ELF::R_X86_64_32S: + case ELF::R_X86_64_64: + return false; + default: + llvm_unreachable("unknown x86_64 relocation type"); + } + } else { + switch(RelTy) { + case ELF::R_386_PC32: + return true; + case ELF::R_386_32: + return false; + default: + llvm_unreachable("unknown x86 relocation type"); + } + } + return 0; +} + +unsigned X86ELFWriterInfo::getAbsoluteLabelMachineRelTy() const { + return is64Bit ? + X86::reloc_absolute_dword : X86::reloc_absolute_word; +} + +long int X86ELFWriterInfo::computeRelocation(unsigned SymOffset, + unsigned RelOffset, + unsigned RelTy) const { + + if (RelTy == ELF::R_X86_64_PC32 || RelTy == ELF::R_386_PC32) + return SymOffset - (RelOffset + 4); + else + assert(0 && "computeRelocation unknown for this relocation type"); + + return 0; +} diff --git a/contrib/llvm/lib/Target/X86/X86ELFWriterInfo.h b/contrib/llvm/lib/Target/X86/X86ELFWriterInfo.h new file mode 100644 index 0000000..a45b5bb --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86ELFWriterInfo.h @@ -0,0 +1,59 @@ +//===-- X86ELFWriterInfo.h - ELF Writer Info for X86 ------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements ELF writer information for the X86 backend. 
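computeRelocation above resolves the PC-relative cases in place: the patched word is the symbol offset minus the end of the 4-byte field, i.e. the usual S + A - P with the addend A fixed at -4 by getDefaultAddendForRelTy. A small sketch under those assumptions (offsets stand in for addresses):

#include <cassert>
#include <cstdint>

// R_X86_64_PC32 / R_386_PC32 style resolution: S + A - P, with the -4
// (the size of the relocated field) folded into the addend.
static int64_t resolvePC32(uint64_t SymOffset, uint64_t RelOffset,
                           int64_t Addend = -4) {
  return int64_t(SymOffset) + Addend - int64_t(RelOffset);
}

int main() {
  // Symbol at offset 0x40, relocation field at offset 0x10:
  assert(resolvePC32(0x40, 0x10) == 0x2C);   // 0x40 - (0x10 + 4)
  return 0;
}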
+// +//===----------------------------------------------------------------------===// + +#ifndef X86_ELF_WRITER_INFO_H +#define X86_ELF_WRITER_INFO_H + +#include "llvm/Target/TargetELFWriterInfo.h" + +namespace llvm { + + class X86ELFWriterInfo : public TargetELFWriterInfo { + + public: + X86ELFWriterInfo(bool is64Bit_, bool isLittleEndian_); + virtual ~X86ELFWriterInfo(); + + /// getRelocationType - Returns the target specific ELF Relocation type. + /// 'MachineRelTy' contains the object code independent relocation type + virtual unsigned getRelocationType(unsigned MachineRelTy) const; + + /// hasRelocationAddend - True if the target uses an addend in the + /// ELF relocation entry. + virtual bool hasRelocationAddend() const { return is64Bit ? true : false; } + + /// getDefaultAddendForRelTy - Gets the default addend value for a + /// relocation entry based on the target ELF relocation type. + virtual long int getDefaultAddendForRelTy(unsigned RelTy, + long int Modifier = 0) const; + + /// getRelTySize - Returns the size of relocatable field in bits + virtual unsigned getRelocationTySize(unsigned RelTy) const; + + /// isPCRelativeRel - True if the relocation type is pc relative + virtual bool isPCRelativeRel(unsigned RelTy) const; + + /// getJumpTableRelocationTy - Returns the machine relocation type used + /// to reference a jumptable. + virtual unsigned getAbsoluteLabelMachineRelTy() const; + + /// computeRelocation - Some relocatable fields could be relocated + /// directly, avoiding the relocation symbol emission, compute the + /// final relocation value for this symbol. + virtual long int computeRelocation(unsigned SymOffset, unsigned RelOffset, + unsigned RelTy) const; + }; + +} // end llvm namespace + +#endif // X86_ELF_WRITER_INFO_H diff --git a/contrib/llvm/lib/Target/X86/X86FastISel.cpp b/contrib/llvm/lib/Target/X86/X86FastISel.cpp new file mode 100644 index 0000000..f912b28 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86FastISel.cpp @@ -0,0 +1,2162 @@ +//===-- X86FastISel.cpp - X86 FastISel implementation ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the X86-specific support for the FastISel class. Much +// of the target-specific code is generated by tablegen in the file +// X86GenFastISel.inc, which is #included here. 
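hasRelocationAddend() returning true only on 64-bit, in the X86ELFWriterInfo class above, reflects the ELF split between REL entries (i386, addend stored in the relocated field itself) and RELA entries (x86-64, explicit addend). For reference, the RELA record shape as the ELF64 format defines it; the struct name below is mine:

#include <cstdint>

// Shape of an ELF64 RELA entry; the real definition lives in the ELF headers.
struct Elf64RelaSketch {
  uint64_t r_offset;    // where to apply the relocation
  uint64_t r_info;      // (symbol index << 32) | relocation type
  int64_t  r_addend;    // explicit addend, e.g. -4 for a PC32 relocation
};

static uint64_t makeInfo(uint32_t Sym, uint32_t Type) {
  return (uint64_t(Sym) << 32) | Type;      // same packing as ELF64_R_INFO
}

int main() {
  // Relocation type 2 is R_X86_64_PC32 in the x86-64 psABI.
  Elf64RelaSketch R = { 0x10, makeInfo(/*Sym=*/3, /*Type=*/2), -4 };
  (void)R;
  return 0;
}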
+// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86InstrBuilder.h" +#include "X86ISelLowering.h" +#include "X86RegisterInfo.h" +#include "X86Subtarget.h" +#include "X86TargetMachine.h" +#include "llvm/CallingConv.h" +#include "llvm/DerivedTypes.h" +#include "llvm/GlobalVariable.h" +#include "llvm/GlobalAlias.h" +#include "llvm/Instructions.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Operator.h" +#include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/FastISel.h" +#include "llvm/CodeGen/FunctionLoweringInfo.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/CallSite.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/GetElementPtrTypeIterator.h" +#include "llvm/Target/TargetOptions.h" +using namespace llvm; + +namespace { + +class X86FastISel : public FastISel { + /// Subtarget - Keep a pointer to the X86Subtarget around so that we can + /// make the right decision when generating code for different targets. + const X86Subtarget *Subtarget; + + /// StackPtr - Register used as the stack pointer. + /// + unsigned StackPtr; + + /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87 + /// floating point ops. + /// When SSE is available, use it for f32 operations. + /// When SSE2 is available, use it for f64 operations. + bool X86ScalarSSEf64; + bool X86ScalarSSEf32; + +public: + explicit X86FastISel(FunctionLoweringInfo &funcInfo) : FastISel(funcInfo) { + Subtarget = &TM.getSubtarget<X86Subtarget>(); + StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP; + X86ScalarSSEf64 = Subtarget->hasSSE2() || Subtarget->hasAVX(); + X86ScalarSSEf32 = Subtarget->hasSSE1() || Subtarget->hasAVX(); + } + + virtual bool TargetSelectInstruction(const Instruction *I); + + /// TryToFoldLoad - The specified machine instr operand is a vreg, and that + /// vreg is being provided by the specified load instruction. If possible, + /// try to fold the load as an operand to the instruction, returning true if + /// possible. 
+ virtual bool TryToFoldLoad(MachineInstr *MI, unsigned OpNo, + const LoadInst *LI); + +#include "X86GenFastISel.inc" + +private: + bool X86FastEmitCompare(const Value *LHS, const Value *RHS, EVT VT); + + bool X86FastEmitLoad(EVT VT, const X86AddressMode &AM, unsigned &RR); + + bool X86FastEmitStore(EVT VT, const Value *Val, const X86AddressMode &AM); + bool X86FastEmitStore(EVT VT, unsigned Val, const X86AddressMode &AM); + + bool X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT, + unsigned &ResultReg); + + bool X86SelectAddress(const Value *V, X86AddressMode &AM); + bool X86SelectCallAddress(const Value *V, X86AddressMode &AM); + + bool X86SelectLoad(const Instruction *I); + + bool X86SelectStore(const Instruction *I); + + bool X86SelectRet(const Instruction *I); + + bool X86SelectCmp(const Instruction *I); + + bool X86SelectZExt(const Instruction *I); + + bool X86SelectBranch(const Instruction *I); + + bool X86SelectShift(const Instruction *I); + + bool X86SelectSelect(const Instruction *I); + + bool X86SelectTrunc(const Instruction *I); + + bool X86SelectFPExt(const Instruction *I); + bool X86SelectFPTrunc(const Instruction *I); + + bool X86VisitIntrinsicCall(const IntrinsicInst &I); + bool X86SelectCall(const Instruction *I); + + bool DoSelectCall(const Instruction *I, const char *MemIntName); + + const X86InstrInfo *getInstrInfo() const { + return getTargetMachine()->getInstrInfo(); + } + const X86TargetMachine *getTargetMachine() const { + return static_cast<const X86TargetMachine *>(&TM); + } + + unsigned TargetMaterializeConstant(const Constant *C); + + unsigned TargetMaterializeAlloca(const AllocaInst *C); + + unsigned TargetMaterializeFloatZero(const ConstantFP *CF); + + /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is + /// computed in an SSE register, not on the X87 floating point stack. + bool isScalarFPTypeInSSEReg(EVT VT) const { + return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2 + (VT == MVT::f32 && X86ScalarSSEf32); // f32 is when SSE1 + } + + bool isTypeLegal(Type *Ty, MVT &VT, bool AllowI1 = false); + + bool IsMemcpySmall(uint64_t Len); + + bool TryEmitSmallMemcpy(X86AddressMode DestAM, + X86AddressMode SrcAM, uint64_t Len); +}; + +} // end anonymous namespace. + +bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) { + EVT evt = TLI.getValueType(Ty, /*HandleUnknown=*/true); + if (evt == MVT::Other || !evt.isSimple()) + // Unhandled type. Halt "fast" selection and bail. + return false; + + VT = evt.getSimpleVT(); + // For now, require SSE/SSE2 for performing floating-point operations, + // since x87 requires additional work. + if (VT == MVT::f64 && !X86ScalarSSEf64) + return false; + if (VT == MVT::f32 && !X86ScalarSSEf32) + return false; + // Similarly, no f80 support yet. + if (VT == MVT::f80) + return false; + // We only handle legal types. For example, on x86-32 the instruction + // selector contains all of the 64-bit instructions from x86-64, + // under the assumption that i64 won't be used if the target doesn't + // support it. + return (AllowI1 && VT == MVT::i1) || TLI.isTypeLegal(VT); +} + +#include "X86GenCallingConv.inc" + +/// X86FastEmitLoad - Emit a machine instruction to load a value of type VT. +/// The address is either pre-computed, i.e. Ptr, or a GlobalAddress, i.e. GV. +/// Return true and the result register by reference if it is possible. 
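isTypeLegal above is essentially a whitelist: ordinary integer types, f32/f64 only when the matching SSE level is available, never f80, and i1 only when the caller opts in. A condensed standalone version of that gate (the type tags are mine, not MVT, and the final TLI.isTypeLegal check is left out):

#include <cassert>

enum class Ty { I1, I8, I16, I32, I64, F32, F64, F80, Other };

// Decide whether the fast path should even try to handle a value of this
// type; anything rejected falls back to the normal instruction selector.
static bool fastPathCanHandle(Ty T, bool HasSSE1, bool HasSSE2,
                              bool AllowI1 = false) {
  switch (T) {
  case Ty::I1:
    return AllowI1;
  case Ty::I8: case Ty::I16: case Ty::I32: case Ty::I64:
    return true;
  case Ty::F32:
    return HasSSE1;          // x87 would need extra work, so require SSE
  case Ty::F64:
    return HasSSE2;
  case Ty::F80:
    return false;            // no f80 support
  default:
    return false;
  }
}

int main() {
  assert(fastPathCanHandle(Ty::F64, /*SSE1*/true, /*SSE2*/true));
  assert(!fastPathCanHandle(Ty::F64, /*SSE1*/true, /*SSE2*/false));
  assert(!fastPathCanHandle(Ty::F80, true, true));
  return 0;
}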
+bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM, + unsigned &ResultReg) { + // Get opcode and regclass of the output for the given load instruction. + unsigned Opc = 0; + const TargetRegisterClass *RC = NULL; + switch (VT.getSimpleVT().SimpleTy) { + default: return false; + case MVT::i1: + case MVT::i8: + Opc = X86::MOV8rm; + RC = X86::GR8RegisterClass; + break; + case MVT::i16: + Opc = X86::MOV16rm; + RC = X86::GR16RegisterClass; + break; + case MVT::i32: + Opc = X86::MOV32rm; + RC = X86::GR32RegisterClass; + break; + case MVT::i64: + // Must be in x86-64 mode. + Opc = X86::MOV64rm; + RC = X86::GR64RegisterClass; + break; + case MVT::f32: + if (X86ScalarSSEf32) { + Opc = Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm; + RC = X86::FR32RegisterClass; + } else { + Opc = X86::LD_Fp32m; + RC = X86::RFP32RegisterClass; + } + break; + case MVT::f64: + if (X86ScalarSSEf64) { + Opc = Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm; + RC = X86::FR64RegisterClass; + } else { + Opc = X86::LD_Fp64m; + RC = X86::RFP64RegisterClass; + } + break; + case MVT::f80: + // No f80 support yet. + return false; + } + + ResultReg = createResultReg(RC); + addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, + DL, TII.get(Opc), ResultReg), AM); + return true; +} + +/// X86FastEmitStore - Emit a machine instruction to store a value Val of +/// type VT. The address is either pre-computed, consisted of a base ptr, Ptr +/// and a displacement offset, or a GlobalAddress, +/// i.e. V. Return true if it is possible. +bool +X86FastISel::X86FastEmitStore(EVT VT, unsigned Val, const X86AddressMode &AM) { + // Get opcode and regclass of the output for the given store instruction. + unsigned Opc = 0; + switch (VT.getSimpleVT().SimpleTy) { + case MVT::f80: // No f80 support yet. + default: return false; + case MVT::i1: { + // Mask out all but lowest bit. + unsigned AndResult = createResultReg(X86::GR8RegisterClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(X86::AND8ri), AndResult).addReg(Val).addImm(1); + Val = AndResult; + } + // FALLTHROUGH, handling i1 as i8. + case MVT::i8: Opc = X86::MOV8mr; break; + case MVT::i16: Opc = X86::MOV16mr; break; + case MVT::i32: Opc = X86::MOV32mr; break; + case MVT::i64: Opc = X86::MOV64mr; break; // Must be in x86-64 mode. + case MVT::f32: + Opc = X86ScalarSSEf32 ? + (Subtarget->hasAVX() ? X86::VMOVSSmr : X86::MOVSSmr) : X86::ST_Fp32m; + break; + case MVT::f64: + Opc = X86ScalarSSEf64 ? + (Subtarget->hasAVX() ? X86::VMOVSDmr : X86::MOVSDmr) : X86::ST_Fp64m; + break; + } + + addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, + DL, TII.get(Opc)), AM).addReg(Val); + return true; +} + +bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val, + const X86AddressMode &AM) { + // Handle 'null' like i32/i64 0. + if (isa<ConstantPointerNull>(Val)) + Val = Constant::getNullValue(TD.getIntPtrType(Val->getContext())); + + // If this is a store of a simple constant, fold the constant into the store. + if (const ConstantInt *CI = dyn_cast<ConstantInt>(Val)) { + unsigned Opc = 0; + bool Signed = true; + switch (VT.getSimpleVT().SimpleTy) { + default: break; + case MVT::i1: Signed = false; // FALLTHROUGH to handle as i8. + case MVT::i8: Opc = X86::MOV8mi; break; + case MVT::i16: Opc = X86::MOV16mi; break; + case MVT::i32: Opc = X86::MOV32mi; break; + case MVT::i64: + // Must be a 32-bit sign extended value. 
+ if ((int)CI->getSExtValue() == CI->getSExtValue()) + Opc = X86::MOV64mi32; + break; + } + + if (Opc) { + addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, + DL, TII.get(Opc)), AM) + .addImm(Signed ? (uint64_t) CI->getSExtValue() : + CI->getZExtValue()); + return true; + } + } + + unsigned ValReg = getRegForValue(Val); + if (ValReg == 0) + return false; + + return X86FastEmitStore(VT, ValReg, AM); +} + +/// X86FastEmitExtend - Emit a machine instruction to extend a value Src of +/// type SrcVT to type DstVT using the specified extension opcode Opc (e.g. +/// ISD::SIGN_EXTEND). +bool X86FastISel::X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, + unsigned Src, EVT SrcVT, + unsigned &ResultReg) { + unsigned RR = FastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Opc, + Src, /*TODO: Kill=*/false); + + if (RR != 0) { + ResultReg = RR; + return true; + } else + return false; +} + +/// X86SelectAddress - Attempt to fill in an address from the given value. +/// +bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { + const User *U = NULL; + unsigned Opcode = Instruction::UserOp1; + if (const Instruction *I = dyn_cast<Instruction>(V)) { + // Don't walk into other basic blocks; it's possible we haven't + // visited them yet, so the instructions may not yet be assigned + // virtual registers. + if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(V)) || + FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) { + Opcode = I->getOpcode(); + U = I; + } + } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) { + Opcode = C->getOpcode(); + U = C; + } + + if (PointerType *Ty = dyn_cast<PointerType>(V->getType())) + if (Ty->getAddressSpace() > 255) + // Fast instruction selection doesn't support the special + // address spaces. + return false; + + switch (Opcode) { + default: break; + case Instruction::BitCast: + // Look past bitcasts. + return X86SelectAddress(U->getOperand(0), AM); + + case Instruction::IntToPtr: + // Look past no-op inttoptrs. + if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy()) + return X86SelectAddress(U->getOperand(0), AM); + break; + + case Instruction::PtrToInt: + // Look past no-op ptrtoints. + if (TLI.getValueType(U->getType()) == TLI.getPointerTy()) + return X86SelectAddress(U->getOperand(0), AM); + break; + + case Instruction::Alloca: { + // Do static allocas. + const AllocaInst *A = cast<AllocaInst>(V); + DenseMap<const AllocaInst*, int>::iterator SI = + FuncInfo.StaticAllocaMap.find(A); + if (SI != FuncInfo.StaticAllocaMap.end()) { + AM.BaseType = X86AddressMode::FrameIndexBase; + AM.Base.FrameIndex = SI->second; + return true; + } + break; + } + + case Instruction::Add: { + // Adds of constants are common and easy enough. + if (const ConstantInt *CI = dyn_cast<ConstantInt>(U->getOperand(1))) { + uint64_t Disp = (int32_t)AM.Disp + (uint64_t)CI->getSExtValue(); + // They have to fit in the 32-bit signed displacement field though. + if (isInt<32>(Disp)) { + AM.Disp = (uint32_t)Disp; + return X86SelectAddress(U->getOperand(0), AM); + } + } + break; + } + + case Instruction::GetElementPtr: { + X86AddressMode SavedAM = AM; + + // Pattern-match simple GEPs. + uint64_t Disp = (int32_t)AM.Disp; + unsigned IndexReg = AM.IndexReg; + unsigned Scale = AM.Scale; + gep_type_iterator GTI = gep_type_begin(U); + // Iterate through the indices, folding what we can. Constants can be + // folded, and one dynamic index can be handled, if the scale is supported. 
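The loop that follows folds whatever it can into the 32-bit displacement: struct fields add their layout offset, constant array indices add index times element size, and at most one dynamic index survives as a scaled register. The constant-only part of that bookkeeping, as a simplified standalone model:

#include <cassert>
#include <cstdint>
#include <vector>

// One folded GEP step, already reduced to a byte offset: either a struct
// field offset or constant-index * element-size.
struct GEPStep { int64_t ByteOffset; };

static bool foldConstantGEP(const std::vector<GEPStep> &Steps, int32_t &Disp) {
  int64_t Off = Disp;
  for (const GEPStep &S : Steps)
    Off += S.ByteOffset;
  if (Off != int64_t(int32_t(Off)))
    return false;                    // must still fit the signed 32-bit field
  Disp = int32_t(Off);
  return true;
}

int main() {
  int32_t Disp = 8;                                  // displacement so far
  std::vector<GEPStep> Steps = { {16}, {3 * 4} };    // field at +16, then a[3] of i32
  assert(foldConstantGEP(Steps, Disp) && Disp == 36);
  return 0;
}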
+ for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end(); + i != e; ++i, ++GTI) { + const Value *Op = *i; + if (StructType *STy = dyn_cast<StructType>(*GTI)) { + const StructLayout *SL = TD.getStructLayout(STy); + Disp += SL->getElementOffset(cast<ConstantInt>(Op)->getZExtValue()); + continue; + } + + // A array/variable index is always of the form i*S where S is the + // constant scale size. See if we can push the scale into immediates. + uint64_t S = TD.getTypeAllocSize(GTI.getIndexedType()); + for (;;) { + if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) { + // Constant-offset addressing. + Disp += CI->getSExtValue() * S; + break; + } + if (isa<AddOperator>(Op) && + (!isa<Instruction>(Op) || + FuncInfo.MBBMap[cast<Instruction>(Op)->getParent()] + == FuncInfo.MBB) && + isa<ConstantInt>(cast<AddOperator>(Op)->getOperand(1))) { + // An add (in the same block) with a constant operand. Fold the + // constant. + ConstantInt *CI = + cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1)); + Disp += CI->getSExtValue() * S; + // Iterate on the other operand. + Op = cast<AddOperator>(Op)->getOperand(0); + continue; + } + if (IndexReg == 0 && + (!AM.GV || !Subtarget->isPICStyleRIPRel()) && + (S == 1 || S == 2 || S == 4 || S == 8)) { + // Scaled-index addressing. + Scale = S; + IndexReg = getRegForGEPIndex(Op).first; + if (IndexReg == 0) + return false; + break; + } + // Unsupported. + goto unsupported_gep; + } + } + // Check for displacement overflow. + if (!isInt<32>(Disp)) + break; + // Ok, the GEP indices were covered by constant-offset and scaled-index + // addressing. Update the address state and move on to examining the base. + AM.IndexReg = IndexReg; + AM.Scale = Scale; + AM.Disp = (uint32_t)Disp; + if (X86SelectAddress(U->getOperand(0), AM)) + return true; + + // If we couldn't merge the gep value into this addr mode, revert back to + // our address and just match the value instead of completely failing. + AM = SavedAM; + break; + unsupported_gep: + // Ok, the GEP indices weren't all covered. + break; + } + } + + // Handle constant address. + if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) { + // Can't handle alternate code models yet. + if (TM.getCodeModel() != CodeModel::Small) + return false; + + // Can't handle TLS yet. + if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV)) + if (GVar->isThreadLocal()) + return false; + + // Can't handle TLS yet, part 2 (this is slightly crazy, but this is how + // it works...). + if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) + if (const GlobalVariable *GVar = + dyn_cast_or_null<GlobalVariable>(GA->resolveAliasedGlobal(false))) + if (GVar->isThreadLocal()) + return false; + + // RIP-relative addresses can't have additional register operands, so if + // we've already folded stuff into the addressing mode, just force the + // global value into its own register, which we can use as the basereg. + if (!Subtarget->isPICStyleRIPRel() || + (AM.Base.Reg == 0 && AM.IndexReg == 0)) { + // Okay, we've committed to selecting this global. Set up the address. + AM.GV = GV; + + // Allow the subtarget to classify the global. + unsigned char GVFlags = Subtarget->ClassifyGlobalReference(GV, TM); + + // If this reference is relative to the pic base, set it now. + if (isGlobalRelativeToPICBase(GVFlags)) { + // FIXME: How do we know Base.Reg is free?? + AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF); + } + + // Unless the ABI requires an extra load, return a direct reference to + // the global. 
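Everything X86SelectAddress builds has to fit the single x86 memory-operand shape, base + scale*index + disp32 (plus an optional symbol), with the scale restricted to 1, 2, 4 or 8, ESP/RSP never usable as the index, and rip-relative forms leaving no room for extra registers. A tiny validity check over that shape; the struct is only loosely modelled on X86AddressMode and its fields are mine:

#include <cassert>
#include <cstdint>
#include <string>

struct AddrSketch {
  std::string Base;     // "" if none
  std::string Index;    // "" if none
  unsigned    Scale;    // meaningful only when Index is set
  int32_t     Disp;
  bool        RIPRel;   // %rip-based addressing
};

static bool isEncodable(const AddrSketch &AM) {
  if (!AM.Index.empty() &&
      AM.Scale != 1 && AM.Scale != 2 && AM.Scale != 4 && AM.Scale != 8)
    return false;                   // hardware scales are 1/2/4/8 only
  if (AM.Index == "ESP" || AM.Index == "RSP")
    return false;                   // ESP/RSP cannot be an index register
  if (AM.RIPRel && (!AM.Base.empty() || !AM.Index.empty()))
    return false;                   // rip-relative allows no other registers
  return true;
}

int main() {
  assert(isEncodable({"EAX", "ECX", 4, 16, false}));    // [EAX + 4*ECX + 16]
  assert(!isEncodable({"", "ESP", 1, 0, false}));       // ESP as index
  assert(!isEncodable({"RBX", "", 1, 0, true}));        // base reg plus rip
  return 0;
}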
+ if (!isGlobalStubReference(GVFlags)) { + if (Subtarget->isPICStyleRIPRel()) { + // Use rip-relative addressing if we can. Above we verified that the + // base and index registers are unused. + assert(AM.Base.Reg == 0 && AM.IndexReg == 0); + AM.Base.Reg = X86::RIP; + } + AM.GVOpFlags = GVFlags; + return true; + } + + // Ok, we need to do a load from a stub. If we've already loaded from + // this stub, reuse the loaded pointer, otherwise emit the load now. + DenseMap<const Value*, unsigned>::iterator I = LocalValueMap.find(V); + unsigned LoadReg; + if (I != LocalValueMap.end() && I->second != 0) { + LoadReg = I->second; + } else { + // Issue load from stub. + unsigned Opc = 0; + const TargetRegisterClass *RC = NULL; + X86AddressMode StubAM; + StubAM.Base.Reg = AM.Base.Reg; + StubAM.GV = GV; + StubAM.GVOpFlags = GVFlags; + + // Prepare for inserting code in the local-value area. + SavePoint SaveInsertPt = enterLocalValueArea(); + + if (TLI.getPointerTy() == MVT::i64) { + Opc = X86::MOV64rm; + RC = X86::GR64RegisterClass; + + if (Subtarget->isPICStyleRIPRel()) + StubAM.Base.Reg = X86::RIP; + } else { + Opc = X86::MOV32rm; + RC = X86::GR32RegisterClass; + } + + LoadReg = createResultReg(RC); + MachineInstrBuilder LoadMI = + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), LoadReg); + addFullAddress(LoadMI, StubAM); + + // Ok, back to normal mode. + leaveLocalValueArea(SaveInsertPt); + + // Prevent loading GV stub multiple times in same MBB. + LocalValueMap[V] = LoadReg; + } + + // Now construct the final address. Note that the Disp, Scale, + // and Index values may already be set here. + AM.Base.Reg = LoadReg; + AM.GV = 0; + return true; + } + } + + // If all else fails, try to materialize the value in a register. + if (!AM.GV || !Subtarget->isPICStyleRIPRel()) { + if (AM.Base.Reg == 0) { + AM.Base.Reg = getRegForValue(V); + return AM.Base.Reg != 0; + } + if (AM.IndexReg == 0) { + assert(AM.Scale == 1 && "Scale with no index!"); + AM.IndexReg = getRegForValue(V); + return AM.IndexReg != 0; + } + } + + return false; +} + +/// X86SelectCallAddress - Attempt to fill in an address from the given value. +/// +bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) { + const User *U = NULL; + unsigned Opcode = Instruction::UserOp1; + if (const Instruction *I = dyn_cast<Instruction>(V)) { + Opcode = I->getOpcode(); + U = I; + } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) { + Opcode = C->getOpcode(); + U = C; + } + + switch (Opcode) { + default: break; + case Instruction::BitCast: + // Look past bitcasts. + return X86SelectCallAddress(U->getOperand(0), AM); + + case Instruction::IntToPtr: + // Look past no-op inttoptrs. + if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy()) + return X86SelectCallAddress(U->getOperand(0), AM); + break; + + case Instruction::PtrToInt: + // Look past no-op ptrtoints. + if (TLI.getValueType(U->getType()) == TLI.getPointerTy()) + return X86SelectCallAddress(U->getOperand(0), AM); + break; + } + + // Handle constant address. + if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) { + // Can't handle alternate code models yet. + if (TM.getCodeModel() != CodeModel::Small) + return false; + + // RIP-relative addresses can't have additional register operands. + if (Subtarget->isPICStyleRIPRel() && + (AM.Base.Reg != 0 || AM.IndexReg != 0)) + return false; + + // Can't handle DLLImport. + if (GV->hasDLLImportLinkage()) + return false; + + // Can't handle TLS. 
+ if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV)) + if (GVar->isThreadLocal()) + return false; + + // Okay, we've committed to selecting this global. Set up the basic address. + AM.GV = GV; + + // No ABI requires an extra load for anything other than DLLImport, which + // we rejected above. Return a direct reference to the global. + if (Subtarget->isPICStyleRIPRel()) { + // Use rip-relative addressing if we can. Above we verified that the + // base and index registers are unused. + assert(AM.Base.Reg == 0 && AM.IndexReg == 0); + AM.Base.Reg = X86::RIP; + } else if (Subtarget->isPICStyleStubPIC()) { + AM.GVOpFlags = X86II::MO_PIC_BASE_OFFSET; + } else if (Subtarget->isPICStyleGOT()) { + AM.GVOpFlags = X86II::MO_GOTOFF; + } + + return true; + } + + // If all else fails, try to materialize the value in a register. + if (!AM.GV || !Subtarget->isPICStyleRIPRel()) { + if (AM.Base.Reg == 0) { + AM.Base.Reg = getRegForValue(V); + return AM.Base.Reg != 0; + } + if (AM.IndexReg == 0) { + assert(AM.Scale == 1 && "Scale with no index!"); + AM.IndexReg = getRegForValue(V); + return AM.IndexReg != 0; + } + } + + return false; +} + + +/// X86SelectStore - Select and emit code to implement store instructions. +bool X86FastISel::X86SelectStore(const Instruction *I) { + // Atomic stores need special handling. + if (cast<StoreInst>(I)->isAtomic()) + return false; + + MVT VT; + if (!isTypeLegal(I->getOperand(0)->getType(), VT, /*AllowI1=*/true)) + return false; + + X86AddressMode AM; + if (!X86SelectAddress(I->getOperand(1), AM)) + return false; + + return X86FastEmitStore(VT, I->getOperand(0), AM); +} + +/// X86SelectRet - Select and emit code to implement ret instructions. +bool X86FastISel::X86SelectRet(const Instruction *I) { + const ReturnInst *Ret = cast<ReturnInst>(I); + const Function &F = *I->getParent()->getParent(); + + if (!FuncInfo.CanLowerReturn) + return false; + + CallingConv::ID CC = F.getCallingConv(); + if (CC != CallingConv::C && + CC != CallingConv::Fast && + CC != CallingConv::X86_FastCall) + return false; + + if (Subtarget->isTargetWin64()) + return false; + + // Don't handle popping bytes on return for now. + if (FuncInfo.MF->getInfo<X86MachineFunctionInfo>() + ->getBytesToPopOnReturn() != 0) + return 0; + + // fastcc with -tailcallopt is intended to provide a guaranteed + // tail call optimization. Fastisel doesn't know how to do that. + if (CC == CallingConv::Fast && GuaranteedTailCallOpt) + return false; + + // Let SDISel handle vararg functions. + if (F.isVarArg()) + return false; + + if (Ret->getNumOperands() > 0) { + SmallVector<ISD::OutputArg, 4> Outs; + GetReturnInfo(F.getReturnType(), F.getAttributes().getRetAttributes(), + Outs, TLI); + + // Analyze operands of the call, assigning locations to each operand. + SmallVector<CCValAssign, 16> ValLocs; + CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs, + I->getContext()); + CCInfo.AnalyzeReturn(Outs, RetCC_X86); + + const Value *RV = Ret->getOperand(0); + unsigned Reg = getRegForValue(RV); + if (Reg == 0) + return false; + + // Only handle a single return value for now. + if (ValLocs.size() != 1) + return false; + + CCValAssign &VA = ValLocs[0]; + + // Don't bother handling odd stuff for now. + if (VA.getLocInfo() != CCValAssign::Full) + return false; + // Only handle register returns for now. + if (!VA.isRegLoc()) + return false; + + // The calling-convention tables for x87 returns don't tell + // the whole story. 
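The return lowering just below never hands back an i1/i8/i16 as-is: the value is widened to the i32 the convention expects, using zero- or sign-extension according to the return attributes, with i1 first zero-extended to i8. The difference the extension choice makes, in plain integer terms:

#include <cassert>
#include <cstdint>

// Widening an 8-bit return value into the 32-bit register the ABI expects.
static uint32_t widenZExt8(uint8_t V) { return uint32_t(V); }
static uint32_t widenSExt8(uint8_t V) { return uint32_t(int32_t(int8_t(V))); }

int main() {
  assert(widenZExt8(0xFF) == 0x000000FFu);   // unsigned 255 stays 255
  assert(widenSExt8(0xFF) == 0xFFFFFFFFu);   // signed -1 stays -1
  return 0;
}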
+ if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) + return false; + + unsigned SrcReg = Reg + VA.getValNo(); + EVT SrcVT = TLI.getValueType(RV->getType()); + EVT DstVT = VA.getValVT(); + // Special handling for extended integers. + if (SrcVT != DstVT) { + if (SrcVT != MVT::i1 && SrcVT != MVT::i8 && SrcVT != MVT::i16) + return false; + + if (!Outs[0].Flags.isZExt() && !Outs[0].Flags.isSExt()) + return false; + + assert(DstVT == MVT::i32 && "X86 should always ext to i32"); + + if (SrcVT == MVT::i1) { + if (Outs[0].Flags.isSExt()) + return false; + SrcReg = FastEmitZExtFromI1(MVT::i8, SrcReg, /*TODO: Kill=*/false); + SrcVT = MVT::i8; + } + unsigned Op = Outs[0].Flags.isZExt() ? ISD::ZERO_EXTEND : + ISD::SIGN_EXTEND; + SrcReg = FastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Op, + SrcReg, /*TODO: Kill=*/false); + } + + // Make the copy. + unsigned DstReg = VA.getLocReg(); + const TargetRegisterClass* SrcRC = MRI.getRegClass(SrcReg); + // Avoid a cross-class copy. This is very unlikely. + if (!SrcRC->contains(DstReg)) + return false; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + DstReg).addReg(SrcReg); + + // Mark the register as live out of the function. + MRI.addLiveOut(VA.getLocReg()); + } + + // Now emit the RET. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::RET)); + return true; +} + +/// X86SelectLoad - Select and emit code to implement load instructions. +/// +bool X86FastISel::X86SelectLoad(const Instruction *I) { + // Atomic loads need special handling. + if (cast<LoadInst>(I)->isAtomic()) + return false; + + MVT VT; + if (!isTypeLegal(I->getType(), VT, /*AllowI1=*/true)) + return false; + + X86AddressMode AM; + if (!X86SelectAddress(I->getOperand(0), AM)) + return false; + + unsigned ResultReg = 0; + if (X86FastEmitLoad(VT, AM, ResultReg)) { + UpdateValueMap(I, ResultReg); + return true; + } + return false; +} + +static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) { + bool HasAVX = Subtarget->hasAVX(); + bool X86ScalarSSEf32 = HasAVX || Subtarget->hasSSE1(); + bool X86ScalarSSEf64 = HasAVX || Subtarget->hasSSE2(); + + switch (VT.getSimpleVT().SimpleTy) { + default: return 0; + case MVT::i8: return X86::CMP8rr; + case MVT::i16: return X86::CMP16rr; + case MVT::i32: return X86::CMP32rr; + case MVT::i64: return X86::CMP64rr; + case MVT::f32: + return X86ScalarSSEf32 ? (HasAVX ? X86::VUCOMISSrr : X86::UCOMISSrr) : 0; + case MVT::f64: + return X86ScalarSSEf64 ? (HasAVX ? X86::VUCOMISDrr : X86::UCOMISDrr) : 0; + } +} + +/// X86ChooseCmpImmediateOpcode - If we have a comparison with RHS as the RHS +/// of the comparison, return an opcode that works for the compare (e.g. +/// CMP32ri) otherwise return 0. +static unsigned X86ChooseCmpImmediateOpcode(EVT VT, const ConstantInt *RHSC) { + switch (VT.getSimpleVT().SimpleTy) { + // Otherwise, we can't fold the immediate into this comparison. + default: return 0; + case MVT::i8: return X86::CMP8ri; + case MVT::i16: return X86::CMP16ri; + case MVT::i32: return X86::CMP32ri; + case MVT::i64: + // 64-bit comparisons are only valid if the immediate fits in a 32-bit sext + // field. + if ((int)RHSC->getSExtValue() == RHSC->getSExtValue()) + return X86::CMP64ri32; + return 0; + } +} + +bool X86FastISel::X86FastEmitCompare(const Value *Op0, const Value *Op1, + EVT VT) { + unsigned Op0Reg = getRegForValue(Op0); + if (Op0Reg == 0) return false; + + // Handle 'null' like i32/i64 0. 
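+  // e.g. 'icmp eq i8* %p, null' is compared against the integer constant 0,
+  // which can then usually be folded into a CMPri below.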
+ if (isa<ConstantPointerNull>(Op1)) + Op1 = Constant::getNullValue(TD.getIntPtrType(Op0->getContext())); + + // We have two options: compare with register or immediate. If the RHS of + // the compare is an immediate that we can fold into this compare, use + // CMPri, otherwise use CMPrr. + if (const ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) { + if (unsigned CompareImmOpc = X86ChooseCmpImmediateOpcode(VT, Op1C)) { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CompareImmOpc)) + .addReg(Op0Reg) + .addImm(Op1C->getSExtValue()); + return true; + } + } + + unsigned CompareOpc = X86ChooseCmpOpcode(VT, Subtarget); + if (CompareOpc == 0) return false; + + unsigned Op1Reg = getRegForValue(Op1); + if (Op1Reg == 0) return false; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CompareOpc)) + .addReg(Op0Reg) + .addReg(Op1Reg); + + return true; +} + +bool X86FastISel::X86SelectCmp(const Instruction *I) { + const CmpInst *CI = cast<CmpInst>(I); + + MVT VT; + if (!isTypeLegal(I->getOperand(0)->getType(), VT)) + return false; + + unsigned ResultReg = createResultReg(&X86::GR8RegClass); + unsigned SetCCOpc; + bool SwapArgs; // false -> compare Op0, Op1. true -> compare Op1, Op0. + switch (CI->getPredicate()) { + case CmpInst::FCMP_OEQ: { + if (!X86FastEmitCompare(CI->getOperand(0), CI->getOperand(1), VT)) + return false; + + unsigned EReg = createResultReg(&X86::GR8RegClass); + unsigned NPReg = createResultReg(&X86::GR8RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::SETEr), EReg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(X86::SETNPr), NPReg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(X86::AND8rr), ResultReg).addReg(NPReg).addReg(EReg); + UpdateValueMap(I, ResultReg); + return true; + } + case CmpInst::FCMP_UNE: { + if (!X86FastEmitCompare(CI->getOperand(0), CI->getOperand(1), VT)) + return false; + + unsigned NEReg = createResultReg(&X86::GR8RegClass); + unsigned PReg = createResultReg(&X86::GR8RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::SETNEr), NEReg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::SETPr), PReg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::OR8rr),ResultReg) + .addReg(PReg).addReg(NEReg); + UpdateValueMap(I, ResultReg); + return true; + } + case CmpInst::FCMP_OGT: SwapArgs = false; SetCCOpc = X86::SETAr; break; + case CmpInst::FCMP_OGE: SwapArgs = false; SetCCOpc = X86::SETAEr; break; + case CmpInst::FCMP_OLT: SwapArgs = true; SetCCOpc = X86::SETAr; break; + case CmpInst::FCMP_OLE: SwapArgs = true; SetCCOpc = X86::SETAEr; break; + case CmpInst::FCMP_ONE: SwapArgs = false; SetCCOpc = X86::SETNEr; break; + case CmpInst::FCMP_ORD: SwapArgs = false; SetCCOpc = X86::SETNPr; break; + case CmpInst::FCMP_UNO: SwapArgs = false; SetCCOpc = X86::SETPr; break; + case CmpInst::FCMP_UEQ: SwapArgs = false; SetCCOpc = X86::SETEr; break; + case CmpInst::FCMP_UGT: SwapArgs = true; SetCCOpc = X86::SETBr; break; + case CmpInst::FCMP_UGE: SwapArgs = true; SetCCOpc = X86::SETBEr; break; + case CmpInst::FCMP_ULT: SwapArgs = false; SetCCOpc = X86::SETBr; break; + case CmpInst::FCMP_ULE: SwapArgs = false; SetCCOpc = X86::SETBEr; break; + + case CmpInst::ICMP_EQ: SwapArgs = false; SetCCOpc = X86::SETEr; break; + case CmpInst::ICMP_NE: SwapArgs = false; SetCCOpc = X86::SETNEr; break; + case CmpInst::ICMP_UGT: SwapArgs = false; SetCCOpc = X86::SETAr; break; + case CmpInst::ICMP_UGE: SwapArgs = false; SetCCOpc = X86::SETAEr; break; + case CmpInst::ICMP_ULT: SwapArgs = false; 
SetCCOpc = X86::SETBr; break; + case CmpInst::ICMP_ULE: SwapArgs = false; SetCCOpc = X86::SETBEr; break; + case CmpInst::ICMP_SGT: SwapArgs = false; SetCCOpc = X86::SETGr; break; + case CmpInst::ICMP_SGE: SwapArgs = false; SetCCOpc = X86::SETGEr; break; + case CmpInst::ICMP_SLT: SwapArgs = false; SetCCOpc = X86::SETLr; break; + case CmpInst::ICMP_SLE: SwapArgs = false; SetCCOpc = X86::SETLEr; break; + default: + return false; + } + + const Value *Op0 = CI->getOperand(0), *Op1 = CI->getOperand(1); + if (SwapArgs) + std::swap(Op0, Op1); + + // Emit a compare of Op0/Op1. + if (!X86FastEmitCompare(Op0, Op1, VT)) + return false; + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(SetCCOpc), ResultReg); + UpdateValueMap(I, ResultReg); + return true; +} + +bool X86FastISel::X86SelectZExt(const Instruction *I) { + // Handle zero-extension from i1 to i8, which is common. + if (!I->getOperand(0)->getType()->isIntegerTy(1)) + return false; + + EVT DstVT = TLI.getValueType(I->getType()); + if (!TLI.isTypeLegal(DstVT)) + return false; + + unsigned ResultReg = getRegForValue(I->getOperand(0)); + if (ResultReg == 0) + return false; + + // Set the high bits to zero. + ResultReg = FastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false); + if (ResultReg == 0) + return false; + + if (DstVT != MVT::i8) { + ResultReg = FastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::ZERO_EXTEND, + ResultReg, /*Kill=*/true); + if (ResultReg == 0) + return false; + } + + UpdateValueMap(I, ResultReg); + return true; +} + + +bool X86FastISel::X86SelectBranch(const Instruction *I) { + // Unconditional branches are selected by tablegen-generated code. + // Handle a conditional branch. + const BranchInst *BI = cast<BranchInst>(I); + MachineBasicBlock *TrueMBB = FuncInfo.MBBMap[BI->getSuccessor(0)]; + MachineBasicBlock *FalseMBB = FuncInfo.MBBMap[BI->getSuccessor(1)]; + + // Fold the common case of a conditional branch with a comparison + // in the same block (values defined on other blocks may not have + // initialized registers). + if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) { + if (CI->hasOneUse() && CI->getParent() == I->getParent()) { + EVT VT = TLI.getValueType(CI->getOperand(0)->getType()); + + // Try to take advantage of fallthrough opportunities. + CmpInst::Predicate Predicate = CI->getPredicate(); + if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) { + std::swap(TrueMBB, FalseMBB); + Predicate = CmpInst::getInversePredicate(Predicate); + } + + bool SwapArgs; // false -> compare Op0, Op1. true -> compare Op1, Op0. + unsigned BranchOpc; // Opcode to jump on, e.g. 
"X86::JA" + + switch (Predicate) { + case CmpInst::FCMP_OEQ: + std::swap(TrueMBB, FalseMBB); + Predicate = CmpInst::FCMP_UNE; + // FALL THROUGH + case CmpInst::FCMP_UNE: SwapArgs = false; BranchOpc = X86::JNE_4; break; + case CmpInst::FCMP_OGT: SwapArgs = false; BranchOpc = X86::JA_4; break; + case CmpInst::FCMP_OGE: SwapArgs = false; BranchOpc = X86::JAE_4; break; + case CmpInst::FCMP_OLT: SwapArgs = true; BranchOpc = X86::JA_4; break; + case CmpInst::FCMP_OLE: SwapArgs = true; BranchOpc = X86::JAE_4; break; + case CmpInst::FCMP_ONE: SwapArgs = false; BranchOpc = X86::JNE_4; break; + case CmpInst::FCMP_ORD: SwapArgs = false; BranchOpc = X86::JNP_4; break; + case CmpInst::FCMP_UNO: SwapArgs = false; BranchOpc = X86::JP_4; break; + case CmpInst::FCMP_UEQ: SwapArgs = false; BranchOpc = X86::JE_4; break; + case CmpInst::FCMP_UGT: SwapArgs = true; BranchOpc = X86::JB_4; break; + case CmpInst::FCMP_UGE: SwapArgs = true; BranchOpc = X86::JBE_4; break; + case CmpInst::FCMP_ULT: SwapArgs = false; BranchOpc = X86::JB_4; break; + case CmpInst::FCMP_ULE: SwapArgs = false; BranchOpc = X86::JBE_4; break; + + case CmpInst::ICMP_EQ: SwapArgs = false; BranchOpc = X86::JE_4; break; + case CmpInst::ICMP_NE: SwapArgs = false; BranchOpc = X86::JNE_4; break; + case CmpInst::ICMP_UGT: SwapArgs = false; BranchOpc = X86::JA_4; break; + case CmpInst::ICMP_UGE: SwapArgs = false; BranchOpc = X86::JAE_4; break; + case CmpInst::ICMP_ULT: SwapArgs = false; BranchOpc = X86::JB_4; break; + case CmpInst::ICMP_ULE: SwapArgs = false; BranchOpc = X86::JBE_4; break; + case CmpInst::ICMP_SGT: SwapArgs = false; BranchOpc = X86::JG_4; break; + case CmpInst::ICMP_SGE: SwapArgs = false; BranchOpc = X86::JGE_4; break; + case CmpInst::ICMP_SLT: SwapArgs = false; BranchOpc = X86::JL_4; break; + case CmpInst::ICMP_SLE: SwapArgs = false; BranchOpc = X86::JLE_4; break; + default: + return false; + } + + const Value *Op0 = CI->getOperand(0), *Op1 = CI->getOperand(1); + if (SwapArgs) + std::swap(Op0, Op1); + + // Emit a compare of the LHS and RHS, setting the flags. + if (!X86FastEmitCompare(Op0, Op1, VT)) + return false; + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(BranchOpc)) + .addMBB(TrueMBB); + + if (Predicate == CmpInst::FCMP_UNE) { + // X86 requires a second branch to handle UNE (and OEQ, + // which is mapped to UNE above). + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::JP_4)) + .addMBB(TrueMBB); + } + + FastEmitBranch(FalseMBB, DL); + FuncInfo.MBB->addSuccessor(TrueMBB); + return true; + } + } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) { + // Handle things like "%cond = trunc i32 %X to i1 / br i1 %cond", which + // typically happen for _Bool and C++ bools. 
+ MVT SourceVT; + if (TI->hasOneUse() && TI->getParent() == I->getParent() && + isTypeLegal(TI->getOperand(0)->getType(), SourceVT)) { + unsigned TestOpc = 0; + switch (SourceVT.SimpleTy) { + default: break; + case MVT::i8: TestOpc = X86::TEST8ri; break; + case MVT::i16: TestOpc = X86::TEST16ri; break; + case MVT::i32: TestOpc = X86::TEST32ri; break; + case MVT::i64: TestOpc = X86::TEST64ri32; break; + } + if (TestOpc) { + unsigned OpReg = getRegForValue(TI->getOperand(0)); + if (OpReg == 0) return false; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TestOpc)) + .addReg(OpReg).addImm(1); + + unsigned JmpOpc = X86::JNE_4; + if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) { + std::swap(TrueMBB, FalseMBB); + JmpOpc = X86::JE_4; + } + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(JmpOpc)) + .addMBB(TrueMBB); + FastEmitBranch(FalseMBB, DL); + FuncInfo.MBB->addSuccessor(TrueMBB); + return true; + } + } + } + + // Otherwise do a clumsy setcc and re-test it. + // Note that i1 essentially gets ANY_EXTEND'ed to i8 where it isn't used + // in an explicit cast, so make sure to handle that correctly. + unsigned OpReg = getRegForValue(BI->getCondition()); + if (OpReg == 0) return false; + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::TEST8ri)) + .addReg(OpReg).addImm(1); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::JNE_4)) + .addMBB(TrueMBB); + FastEmitBranch(FalseMBB, DL); + FuncInfo.MBB->addSuccessor(TrueMBB); + return true; +} + +bool X86FastISel::X86SelectShift(const Instruction *I) { + unsigned CReg = 0, OpReg = 0; + const TargetRegisterClass *RC = NULL; + if (I->getType()->isIntegerTy(8)) { + CReg = X86::CL; + RC = &X86::GR8RegClass; + switch (I->getOpcode()) { + case Instruction::LShr: OpReg = X86::SHR8rCL; break; + case Instruction::AShr: OpReg = X86::SAR8rCL; break; + case Instruction::Shl: OpReg = X86::SHL8rCL; break; + default: return false; + } + } else if (I->getType()->isIntegerTy(16)) { + CReg = X86::CX; + RC = &X86::GR16RegClass; + switch (I->getOpcode()) { + case Instruction::LShr: OpReg = X86::SHR16rCL; break; + case Instruction::AShr: OpReg = X86::SAR16rCL; break; + case Instruction::Shl: OpReg = X86::SHL16rCL; break; + default: return false; + } + } else if (I->getType()->isIntegerTy(32)) { + CReg = X86::ECX; + RC = &X86::GR32RegClass; + switch (I->getOpcode()) { + case Instruction::LShr: OpReg = X86::SHR32rCL; break; + case Instruction::AShr: OpReg = X86::SAR32rCL; break; + case Instruction::Shl: OpReg = X86::SHL32rCL; break; + default: return false; + } + } else if (I->getType()->isIntegerTy(64)) { + CReg = X86::RCX; + RC = &X86::GR64RegClass; + switch (I->getOpcode()) { + case Instruction::LShr: OpReg = X86::SHR64rCL; break; + case Instruction::AShr: OpReg = X86::SAR64rCL; break; + case Instruction::Shl: OpReg = X86::SHL64rCL; break; + default: return false; + } + } else { + return false; + } + + MVT VT; + if (!isTypeLegal(I->getType(), VT)) + return false; + + unsigned Op0Reg = getRegForValue(I->getOperand(0)); + if (Op0Reg == 0) return false; + + unsigned Op1Reg = getRegForValue(I->getOperand(1)); + if (Op1Reg == 0) return false; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + CReg).addReg(Op1Reg); + + // The shift instruction uses X86::CL. If we defined a super-register + // of X86::CL, emit a subreg KILL to precisely describe what we're doing here. 
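+  // Roughly: 'mov <amt>, %ecx ; shll %cl, <val>', with the KILL making it
+  // explicit that only %cl is meant to be read out of that wider copy.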
+ if (CReg != X86::CL) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(TargetOpcode::KILL), X86::CL) + .addReg(CReg, RegState::Kill); + + unsigned ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(OpReg), ResultReg) + .addReg(Op0Reg); + UpdateValueMap(I, ResultReg); + return true; +} + +bool X86FastISel::X86SelectSelect(const Instruction *I) { + MVT VT; + if (!isTypeLegal(I->getType(), VT)) + return false; + + // We only use cmov here, if we don't have a cmov instruction bail. + if (!Subtarget->hasCMov()) return false; + + unsigned Opc = 0; + const TargetRegisterClass *RC = NULL; + if (VT == MVT::i16) { + Opc = X86::CMOVE16rr; + RC = &X86::GR16RegClass; + } else if (VT == MVT::i32) { + Opc = X86::CMOVE32rr; + RC = &X86::GR32RegClass; + } else if (VT == MVT::i64) { + Opc = X86::CMOVE64rr; + RC = &X86::GR64RegClass; + } else { + return false; + } + + unsigned Op0Reg = getRegForValue(I->getOperand(0)); + if (Op0Reg == 0) return false; + unsigned Op1Reg = getRegForValue(I->getOperand(1)); + if (Op1Reg == 0) return false; + unsigned Op2Reg = getRegForValue(I->getOperand(2)); + if (Op2Reg == 0) return false; + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::TEST8rr)) + .addReg(Op0Reg).addReg(Op0Reg); + unsigned ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg) + .addReg(Op1Reg).addReg(Op2Reg); + UpdateValueMap(I, ResultReg); + return true; +} + +bool X86FastISel::X86SelectFPExt(const Instruction *I) { + // fpext from float to double. + if (X86ScalarSSEf64 && + I->getType()->isDoubleTy()) { + const Value *V = I->getOperand(0); + if (V->getType()->isFloatTy()) { + unsigned OpReg = getRegForValue(V); + if (OpReg == 0) return false; + unsigned ResultReg = createResultReg(X86::FR64RegisterClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(X86::CVTSS2SDrr), ResultReg) + .addReg(OpReg); + UpdateValueMap(I, ResultReg); + return true; + } + } + + return false; +} + +bool X86FastISel::X86SelectFPTrunc(const Instruction *I) { + if (X86ScalarSSEf64) { + if (I->getType()->isFloatTy()) { + const Value *V = I->getOperand(0); + if (V->getType()->isDoubleTy()) { + unsigned OpReg = getRegForValue(V); + if (OpReg == 0) return false; + unsigned ResultReg = createResultReg(X86::FR32RegisterClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(X86::CVTSD2SSrr), ResultReg) + .addReg(OpReg); + UpdateValueMap(I, ResultReg); + return true; + } + } + } + + return false; +} + +bool X86FastISel::X86SelectTrunc(const Instruction *I) { + EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType()); + EVT DstVT = TLI.getValueType(I->getType()); + + // This code only handles truncation to byte. + if (DstVT != MVT::i8 && DstVT != MVT::i1) + return false; + if (!TLI.isTypeLegal(SrcVT)) + return false; + + unsigned InputReg = getRegForValue(I->getOperand(0)); + if (!InputReg) + // Unhandled operand. Halt "fast" selection and bail. + return false; + + if (SrcVT == MVT::i8) { + // Truncate from i8 to i1; no code needed. + UpdateValueMap(I, InputReg); + return true; + } + + if (!Subtarget->is64Bit()) { + // If we're on x86-32; we can't extract an i8 from a general register. + // First issue a copy to GR16_ABCD or GR32_ABCD. + const TargetRegisterClass *CopyRC = (SrcVT == MVT::i16) + ? 
X86::GR16_ABCDRegisterClass : X86::GR32_ABCDRegisterClass; + unsigned CopyReg = createResultReg(CopyRC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + CopyReg).addReg(InputReg); + InputReg = CopyReg; + } + + // Issue an extract_subreg. + unsigned ResultReg = FastEmitInst_extractsubreg(MVT::i8, + InputReg, /*Kill=*/true, + X86::sub_8bit); + if (!ResultReg) + return false; + + UpdateValueMap(I, ResultReg); + return true; +} + +bool X86FastISel::IsMemcpySmall(uint64_t Len) { + return Len <= (Subtarget->is64Bit() ? 32 : 16); +} + +bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM, + X86AddressMode SrcAM, uint64_t Len) { + + // Make sure we don't bloat code by inlining very large memcpy's. + if (!IsMemcpySmall(Len)) + return false; + + bool i64Legal = Subtarget->is64Bit(); + + // We don't care about alignment here since we just emit integer accesses. + while (Len) { + MVT VT; + if (Len >= 8 && i64Legal) + VT = MVT::i64; + else if (Len >= 4) + VT = MVT::i32; + else if (Len >= 2) + VT = MVT::i16; + else { + assert(Len == 1); + VT = MVT::i8; + } + + unsigned Reg; + bool RV = X86FastEmitLoad(VT, SrcAM, Reg); + RV &= X86FastEmitStore(VT, Reg, DestAM); + assert(RV && "Failed to emit load or store??"); + + unsigned Size = VT.getSizeInBits()/8; + Len -= Size; + DestAM.Disp += Size; + SrcAM.Disp += Size; + } + + return true; +} + +bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) { + // FIXME: Handle more intrinsics. + switch (I.getIntrinsicID()) { + default: return false; + case Intrinsic::memcpy: { + const MemCpyInst &MCI = cast<MemCpyInst>(I); + // Don't handle volatile or variable length memcpys. + if (MCI.isVolatile()) + return false; + + if (isa<ConstantInt>(MCI.getLength())) { + // Small memcpy's are common enough that we want to do them + // without a call if possible. + uint64_t Len = cast<ConstantInt>(MCI.getLength())->getZExtValue(); + if (IsMemcpySmall(Len)) { + X86AddressMode DestAM, SrcAM; + if (!X86SelectAddress(MCI.getRawDest(), DestAM) || + !X86SelectAddress(MCI.getRawSource(), SrcAM)) + return false; + TryEmitSmallMemcpy(DestAM, SrcAM, Len); + return true; + } + } + + unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32; + if (!MCI.getLength()->getType()->isIntegerTy(SizeWidth)) + return false; + + if (MCI.getSourceAddressSpace() > 255 || MCI.getDestAddressSpace() > 255) + return false; + + return DoSelectCall(&I, "memcpy"); + } + case Intrinsic::memset: { + const MemSetInst &MSI = cast<MemSetInst>(I); + + if (MSI.isVolatile()) + return false; + + unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32; + if (!MSI.getLength()->getType()->isIntegerTy(SizeWidth)) + return false; + + if (MSI.getDestAddressSpace() > 255) + return false; + + return DoSelectCall(&I, "memset"); + } + case Intrinsic::stackprotector: { + // Emit code inline code to store the stack guard onto the stack. + EVT PtrTy = TLI.getPointerTy(); + + const Value *Op1 = I.getArgOperand(0); // The guard's value. + const AllocaInst *Slot = cast<AllocaInst>(I.getArgOperand(1)); + + // Grab the frame index. 
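+    // The guard slot is normally a static alloca, so X86SelectAddress below
+    // resolves it to a frame-index base and the guard value is stored
+    // straight into that stack slot.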
+ X86AddressMode AM; + if (!X86SelectAddress(Slot, AM)) return false; + if (!X86FastEmitStore(PtrTy, Op1, AM)) return false; + return true; + } + case Intrinsic::dbg_declare: { + const DbgDeclareInst *DI = cast<DbgDeclareInst>(&I); + X86AddressMode AM; + assert(DI->getAddress() && "Null address should be checked earlier!"); + if (!X86SelectAddress(DI->getAddress(), AM)) + return false; + const MCInstrDesc &II = TII.get(TargetOpcode::DBG_VALUE); + // FIXME may need to add RegState::Debug to any registers produced, + // although ESP/EBP should be the only ones at the moment. + addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II), AM). + addImm(0).addMetadata(DI->getVariable()); + return true; + } + case Intrinsic::trap: { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::TRAP)); + return true; + } + case Intrinsic::sadd_with_overflow: + case Intrinsic::uadd_with_overflow: { + // FIXME: Should fold immediates. + + // Replace "add with overflow" intrinsics with an "add" instruction followed + // by a seto/setc instruction. + const Function *Callee = I.getCalledFunction(); + Type *RetTy = + cast<StructType>(Callee->getReturnType())->getTypeAtIndex(unsigned(0)); + + MVT VT; + if (!isTypeLegal(RetTy, VT)) + return false; + + const Value *Op1 = I.getArgOperand(0); + const Value *Op2 = I.getArgOperand(1); + unsigned Reg1 = getRegForValue(Op1); + unsigned Reg2 = getRegForValue(Op2); + + if (Reg1 == 0 || Reg2 == 0) + // FIXME: Handle values *not* in registers. + return false; + + unsigned OpC = 0; + if (VT == MVT::i32) + OpC = X86::ADD32rr; + else if (VT == MVT::i64) + OpC = X86::ADD64rr; + else + return false; + + // The call to CreateRegs builds two sequential registers, to store the + // both the the returned values. + unsigned ResultReg = FuncInfo.CreateRegs(I.getType()); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(OpC), ResultReg) + .addReg(Reg1).addReg(Reg2); + + unsigned Opc = X86::SETBr; + if (I.getIntrinsicID() == Intrinsic::sadd_with_overflow) + Opc = X86::SETOr; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg+1); + + UpdateValueMap(&I, ResultReg, 2); + return true; + } + } +} + +bool X86FastISel::X86SelectCall(const Instruction *I) { + const CallInst *CI = cast<CallInst>(I); + const Value *Callee = CI->getCalledValue(); + + // Can't handle inline asm yet. + if (isa<InlineAsm>(Callee)) + return false; + + // Handle intrinsic calls. + if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) + return X86VisitIntrinsicCall(*II); + + return DoSelectCall(I, 0); +} + +// Select either a call, or an llvm.memcpy/memmove/memset intrinsic +bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { + const CallInst *CI = cast<CallInst>(I); + const Value *Callee = CI->getCalledValue(); + + // Handle only C and fastcc calling conventions for now. + ImmutableCallSite CS(CI); + CallingConv::ID CC = CS.getCallingConv(); + if (CC != CallingConv::C && CC != CallingConv::Fast && + CC != CallingConv::X86_FastCall) + return false; + + // fastcc with -tailcallopt is intended to provide a guaranteed + // tail call optimization. Fastisel doesn't know how to do that. + if (CC == CallingConv::Fast && GuaranteedTailCallOpt) + return false; + + PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType()); + FunctionType *FTy = cast<FunctionType>(PT->getElementType()); + bool isVarArg = FTy->isVarArg(); + + // Don't know how to handle Win64 varargs yet. Nothing special needed for + // x86-32. 
Special handling for x86-64 is implemented. + if (isVarArg && Subtarget->isTargetWin64()) + return false; + + // Fast-isel doesn't know about callee-pop yet. + if (X86::isCalleePop(CC, Subtarget->is64Bit(), isVarArg, + GuaranteedTailCallOpt)) + return false; + + // Check whether the function can return without sret-demotion. + SmallVector<ISD::OutputArg, 4> Outs; + SmallVector<uint64_t, 4> Offsets; + GetReturnInfo(I->getType(), CS.getAttributes().getRetAttributes(), + Outs, TLI, &Offsets); + bool CanLowerReturn = TLI.CanLowerReturn(CS.getCallingConv(), + *FuncInfo.MF, FTy->isVarArg(), + Outs, FTy->getContext()); + if (!CanLowerReturn) + return false; + + // Materialize callee address in a register. FIXME: GV address can be + // handled with a CALLpcrel32 instead. + X86AddressMode CalleeAM; + if (!X86SelectCallAddress(Callee, CalleeAM)) + return false; + unsigned CalleeOp = 0; + const GlobalValue *GV = 0; + if (CalleeAM.GV != 0) { + GV = CalleeAM.GV; + } else if (CalleeAM.Base.Reg != 0) { + CalleeOp = CalleeAM.Base.Reg; + } else + return false; + + // Deal with call operands first. + SmallVector<const Value *, 8> ArgVals; + SmallVector<unsigned, 8> Args; + SmallVector<MVT, 8> ArgVTs; + SmallVector<ISD::ArgFlagsTy, 8> ArgFlags; + Args.reserve(CS.arg_size()); + ArgVals.reserve(CS.arg_size()); + ArgVTs.reserve(CS.arg_size()); + ArgFlags.reserve(CS.arg_size()); + for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end(); + i != e; ++i) { + // If we're lowering a mem intrinsic instead of a regular call, skip the + // last two arguments, which should not passed to the underlying functions. + if (MemIntName && e-i <= 2) + break; + Value *ArgVal = *i; + ISD::ArgFlagsTy Flags; + unsigned AttrInd = i - CS.arg_begin() + 1; + if (CS.paramHasAttr(AttrInd, Attribute::SExt)) + Flags.setSExt(); + if (CS.paramHasAttr(AttrInd, Attribute::ZExt)) + Flags.setZExt(); + + if (CS.paramHasAttr(AttrInd, Attribute::ByVal)) { + PointerType *Ty = cast<PointerType>(ArgVal->getType()); + Type *ElementTy = Ty->getElementType(); + unsigned FrameSize = TD.getTypeAllocSize(ElementTy); + unsigned FrameAlign = CS.getParamAlignment(AttrInd); + if (!FrameAlign) + FrameAlign = TLI.getByValTypeAlignment(ElementTy); + Flags.setByVal(); + Flags.setByValSize(FrameSize); + Flags.setByValAlign(FrameAlign); + if (!IsMemcpySmall(FrameSize)) + return false; + } + + if (CS.paramHasAttr(AttrInd, Attribute::InReg)) + Flags.setInReg(); + if (CS.paramHasAttr(AttrInd, Attribute::Nest)) + Flags.setNest(); + + // If this is an i1/i8/i16 argument, promote to i32 to avoid an extra + // instruction. This is safe because it is common to all fastisel supported + // calling conventions on x86. + if (ConstantInt *CI = dyn_cast<ConstantInt>(ArgVal)) { + if (CI->getBitWidth() == 1 || CI->getBitWidth() == 8 || + CI->getBitWidth() == 16) { + if (Flags.isSExt()) + ArgVal = ConstantExpr::getSExt(CI,Type::getInt32Ty(CI->getContext())); + else + ArgVal = ConstantExpr::getZExt(CI,Type::getInt32Ty(CI->getContext())); + } + } + + unsigned ArgReg; + + // Passing bools around ends up doing a trunc to i1 and passing it. + // Codegen this as an argument + "and 1". 
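+    // e.g. for '%b = trunc i32 %x to i1' passed as an i1 argument, the
+    // original i32 register is reused and masked with 'and $1' (the
+    // FastEmit_ri(ISD::AND, ..., 1) below) instead of emitting the trunc.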
+ if (ArgVal->getType()->isIntegerTy(1) && isa<TruncInst>(ArgVal) && + cast<TruncInst>(ArgVal)->getParent() == I->getParent() && + ArgVal->hasOneUse()) { + ArgVal = cast<TruncInst>(ArgVal)->getOperand(0); + ArgReg = getRegForValue(ArgVal); + if (ArgReg == 0) return false; + + MVT ArgVT; + if (!isTypeLegal(ArgVal->getType(), ArgVT)) return false; + + ArgReg = FastEmit_ri(ArgVT, ArgVT, ISD::AND, ArgReg, + ArgVal->hasOneUse(), 1); + } else { + ArgReg = getRegForValue(ArgVal); + } + + if (ArgReg == 0) return false; + + Type *ArgTy = ArgVal->getType(); + MVT ArgVT; + if (!isTypeLegal(ArgTy, ArgVT)) + return false; + if (ArgVT == MVT::x86mmx) + return false; + unsigned OriginalAlignment = TD.getABITypeAlignment(ArgTy); + Flags.setOrigAlign(OriginalAlignment); + + Args.push_back(ArgReg); + ArgVals.push_back(ArgVal); + ArgVTs.push_back(ArgVT); + ArgFlags.push_back(Flags); + } + + // Analyze operands of the call, assigning locations to each operand. + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CC, isVarArg, *FuncInfo.MF, TM, ArgLocs, + I->getParent()->getContext()); + + // Allocate shadow area for Win64 + if (Subtarget->isTargetWin64()) + CCInfo.AllocateStack(32, 8); + + CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CC_X86); + + // Get a count of how many bytes are to be pushed on the stack. + unsigned NumBytes = CCInfo.getNextStackOffset(); + + // Issue CALLSEQ_START + unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(AdjStackDown)) + .addImm(NumBytes); + + // Process argument: walk the register/memloc assignments, inserting + // copies / loads. + SmallVector<unsigned, 4> RegArgs; + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + unsigned Arg = Args[VA.getValNo()]; + EVT ArgVT = ArgVTs[VA.getValNo()]; + + // Promote the value if needed. 
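+    // For example, an i8 argument that the calling convention assigns to a
+    // 32-bit location arrives here as CCValAssign::SExt/ZExt/AExt and is
+    // widened with X86FastEmitExtend before the register copy or stack
+    // store below.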
+ switch (VA.getLocInfo()) { + default: llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: break; + case CCValAssign::SExt: { + assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() && + "Unexpected extend"); + bool Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), + Arg, ArgVT, Arg); + assert(Emitted && "Failed to emit a sext!"); (void)Emitted; + ArgVT = VA.getLocVT(); + break; + } + case CCValAssign::ZExt: { + assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() && + "Unexpected extend"); + bool Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), + Arg, ArgVT, Arg); + assert(Emitted && "Failed to emit a zext!"); (void)Emitted; + ArgVT = VA.getLocVT(); + break; + } + case CCValAssign::AExt: { + assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() && + "Unexpected extend"); + bool Emitted = X86FastEmitExtend(ISD::ANY_EXTEND, VA.getLocVT(), + Arg, ArgVT, Arg); + if (!Emitted) + Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), + Arg, ArgVT, Arg); + if (!Emitted) + Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), + Arg, ArgVT, Arg); + + assert(Emitted && "Failed to emit a aext!"); (void)Emitted; + ArgVT = VA.getLocVT(); + break; + } + case CCValAssign::BCvt: { + unsigned BC = FastEmit_r(ArgVT.getSimpleVT(), VA.getLocVT(), + ISD::BITCAST, Arg, /*TODO: Kill=*/false); + assert(BC != 0 && "Failed to emit a bitcast!"); + Arg = BC; + ArgVT = VA.getLocVT(); + break; + } + } + + if (VA.isRegLoc()) { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + VA.getLocReg()).addReg(Arg); + RegArgs.push_back(VA.getLocReg()); + } else { + unsigned LocMemOffset = VA.getLocMemOffset(); + X86AddressMode AM; + AM.Base.Reg = StackPtr; + AM.Disp = LocMemOffset; + const Value *ArgVal = ArgVals[VA.getValNo()]; + ISD::ArgFlagsTy Flags = ArgFlags[VA.getValNo()]; + + if (Flags.isByVal()) { + X86AddressMode SrcAM; + SrcAM.Base.Reg = Arg; + bool Res = TryEmitSmallMemcpy(AM, SrcAM, Flags.getByValSize()); + assert(Res && "memcpy length already checked!"); (void)Res; + } else if (isa<ConstantInt>(ArgVal) || isa<ConstantPointerNull>(ArgVal)) { + // If this is a really simple value, emit this with the Value* version + // of X86FastEmitStore. If it isn't simple, we don't want to do this, + // as it can cause us to reevaluate the argument. + X86FastEmitStore(ArgVT, ArgVal, AM); + } else { + X86FastEmitStore(ArgVT, Arg, AM); + } + } + } + + // ELF / PIC requires GOT in the EBX register before function calls via PLT + // GOT pointer. + if (Subtarget->isPICStyleGOT()) { + unsigned Base = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + X86::EBX).addReg(Base); + } + + if (Subtarget->is64Bit() && isVarArg && !Subtarget->isTargetWin64()) { + // Count the number of XMM registers allocated. + static const unsigned XMMArgRegs[] = { + X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, + X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 + }; + unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::MOV8ri), + X86::AL).addImm(NumXMMRegs); + } + + // Issue the call. + MachineInstrBuilder MIB; + if (CalleeOp) { + // Register-indirect call. 
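+    // The callee address is already in a virtual register (CalleeOp), so
+    // this is emitted as an indirect 'call *<reg>' using CALL32r, CALL64r
+    // or WINCALL64r depending on the subtarget.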
+ unsigned CallOpc; + if (Subtarget->isTargetWin64()) + CallOpc = X86::WINCALL64r; + else if (Subtarget->is64Bit()) + CallOpc = X86::CALL64r; + else + CallOpc = X86::CALL32r; + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CallOpc)) + .addReg(CalleeOp); + + } else { + // Direct call. + assert(GV && "Not a direct call"); + unsigned CallOpc; + if (Subtarget->isTargetWin64()) + CallOpc = X86::WINCALL64pcrel32; + else if (Subtarget->is64Bit()) + CallOpc = X86::CALL64pcrel32; + else + CallOpc = X86::CALLpcrel32; + + // See if we need any target-specific flags on the GV operand. + unsigned char OpFlags = 0; + + // On ELF targets, in both X86-64 and X86-32 mode, direct calls to + // external symbols most go through the PLT in PIC mode. If the symbol + // has hidden or protected visibility, or if it is static or local, then + // we don't need to use the PLT - we can directly call it. + if (Subtarget->isTargetELF() && + TM.getRelocationModel() == Reloc::PIC_ && + GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) { + OpFlags = X86II::MO_PLT; + } else if (Subtarget->isPICStyleStubAny() && + (GV->isDeclaration() || GV->isWeakForLinker()) && + (!Subtarget->getTargetTriple().isMacOSX() || + Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { + // PC-relative references to external symbols should go through $stub, + // unless we're building with the leopard linker or later, which + // automatically synthesizes these stubs. + OpFlags = X86II::MO_DARWIN_STUB; + } + + + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CallOpc)); + if (MemIntName) + MIB.addExternalSymbol(MemIntName, OpFlags); + else + MIB.addGlobalAddress(GV, 0, OpFlags); + } + + // Add an implicit use GOT pointer in EBX. + if (Subtarget->isPICStyleGOT()) + MIB.addReg(X86::EBX); + + if (Subtarget->is64Bit() && isVarArg && !Subtarget->isTargetWin64()) + MIB.addReg(X86::AL); + + // Add implicit physical register uses to the call. + for (unsigned i = 0, e = RegArgs.size(); i != e; ++i) + MIB.addReg(RegArgs[i]); + + // Issue CALLSEQ_END + unsigned AdjStackUp = TII.getCallFrameDestroyOpcode(); + unsigned NumBytesCallee = 0; + if (!Subtarget->is64Bit() && CS.paramHasAttr(1, Attribute::StructRet)) + NumBytesCallee = 4; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(AdjStackUp)) + .addImm(NumBytes).addImm(NumBytesCallee); + + // Build info for return calling conv lowering code. + // FIXME: This is practically a copy-paste from TargetLowering::LowerCallTo. + SmallVector<ISD::InputArg, 32> Ins; + SmallVector<EVT, 4> RetTys; + ComputeValueVTs(TLI, I->getType(), RetTys); + for (unsigned i = 0, e = RetTys.size(); i != e; ++i) { + EVT VT = RetTys[i]; + EVT RegisterVT = TLI.getRegisterType(I->getParent()->getContext(), VT); + unsigned NumRegs = TLI.getNumRegisters(I->getParent()->getContext(), VT); + for (unsigned j = 0; j != NumRegs; ++j) { + ISD::InputArg MyFlags; + MyFlags.VT = RegisterVT.getSimpleVT(); + MyFlags.Used = !CS.getInstruction()->use_empty(); + if (CS.paramHasAttr(0, Attribute::SExt)) + MyFlags.Flags.setSExt(); + if (CS.paramHasAttr(0, Attribute::ZExt)) + MyFlags.Flags.setZExt(); + if (CS.paramHasAttr(0, Attribute::InReg)) + MyFlags.Flags.setInReg(); + Ins.push_back(MyFlags); + } + } + + // Now handle call return values. 
+ SmallVector<unsigned, 4> UsedRegs; + SmallVector<CCValAssign, 16> RVLocs; + CCState CCRetInfo(CC, false, *FuncInfo.MF, TM, RVLocs, + I->getParent()->getContext()); + unsigned ResultReg = FuncInfo.CreateRegs(I->getType()); + CCRetInfo.AnalyzeCallResult(Ins, RetCC_X86); + for (unsigned i = 0; i != RVLocs.size(); ++i) { + EVT CopyVT = RVLocs[i].getValVT(); + unsigned CopyReg = ResultReg + i; + + // If this is a call to a function that returns an fp value on the x87 fp + // stack, but where we prefer to use the value in xmm registers, copy it + // out as F80 and use a truncate to move it from fp stack reg to xmm reg. + if ((RVLocs[i].getLocReg() == X86::ST0 || + RVLocs[i].getLocReg() == X86::ST1)) { + if (isScalarFPTypeInSSEReg(RVLocs[i].getValVT())) { + CopyVT = MVT::f80; + CopyReg = createResultReg(X86::RFP80RegisterClass); + } + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::FpPOP_RETVAL), + CopyReg); + } else { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), + CopyReg).addReg(RVLocs[i].getLocReg()); + UsedRegs.push_back(RVLocs[i].getLocReg()); + } + + if (CopyVT != RVLocs[i].getValVT()) { + // Round the F80 the right size, which also moves to the appropriate xmm + // register. This is accomplished by storing the F80 value in memory and + // then loading it back. Ewww... + EVT ResVT = RVLocs[i].getValVT(); + unsigned Opc = ResVT == MVT::f32 ? X86::ST_Fp80m32 : X86::ST_Fp80m64; + unsigned MemSize = ResVT.getSizeInBits()/8; + int FI = MFI.CreateStackObject(MemSize, MemSize, false); + addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(Opc)), FI) + .addReg(CopyReg); + Opc = ResVT == MVT::f32 ? X86::MOVSSrm : X86::MOVSDrm; + addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(Opc), ResultReg + i), FI); + } + } + + if (RVLocs.size()) + UpdateValueMap(I, ResultReg, RVLocs.size()); + + // Set all unused physreg defs as dead. + static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI); + + return true; +} + + +bool +X86FastISel::TargetSelectInstruction(const Instruction *I) { + switch (I->getOpcode()) { + default: break; + case Instruction::Load: + return X86SelectLoad(I); + case Instruction::Store: + return X86SelectStore(I); + case Instruction::Ret: + return X86SelectRet(I); + case Instruction::ICmp: + case Instruction::FCmp: + return X86SelectCmp(I); + case Instruction::ZExt: + return X86SelectZExt(I); + case Instruction::Br: + return X86SelectBranch(I); + case Instruction::Call: + return X86SelectCall(I); + case Instruction::LShr: + case Instruction::AShr: + case Instruction::Shl: + return X86SelectShift(I); + case Instruction::Select: + return X86SelectSelect(I); + case Instruction::Trunc: + return X86SelectTrunc(I); + case Instruction::FPExt: + return X86SelectFPExt(I); + case Instruction::FPTrunc: + return X86SelectFPTrunc(I); + case Instruction::IntToPtr: // Deliberate fall-through. + case Instruction::PtrToInt: { + EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType()); + EVT DstVT = TLI.getValueType(I->getType()); + if (DstVT.bitsGT(SrcVT)) + return X86SelectZExt(I); + if (DstVT.bitsLT(SrcVT)) + return X86SelectTrunc(I); + unsigned Reg = getRegForValue(I->getOperand(0)); + if (Reg == 0) return false; + UpdateValueMap(I, Reg); + return true; + } + } + + return false; +} + +unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) { + MVT VT; + if (!isTypeLegal(C->getType(), VT)) + return false; + + // Get opcode and regclass of the output for the given load instruction. 
+ unsigned Opc = 0; + const TargetRegisterClass *RC = NULL; + switch (VT.SimpleTy) { + default: return false; + case MVT::i8: + Opc = X86::MOV8rm; + RC = X86::GR8RegisterClass; + break; + case MVT::i16: + Opc = X86::MOV16rm; + RC = X86::GR16RegisterClass; + break; + case MVT::i32: + Opc = X86::MOV32rm; + RC = X86::GR32RegisterClass; + break; + case MVT::i64: + // Must be in x86-64 mode. + Opc = X86::MOV64rm; + RC = X86::GR64RegisterClass; + break; + case MVT::f32: + if (X86ScalarSSEf32) { + Opc = Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm; + RC = X86::FR32RegisterClass; + } else { + Opc = X86::LD_Fp32m; + RC = X86::RFP32RegisterClass; + } + break; + case MVT::f64: + if (X86ScalarSSEf64) { + Opc = Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm; + RC = X86::FR64RegisterClass; + } else { + Opc = X86::LD_Fp64m; + RC = X86::RFP64RegisterClass; + } + break; + case MVT::f80: + // No f80 support yet. + return false; + } + + // Materialize addresses with LEA instructions. + if (isa<GlobalValue>(C)) { + X86AddressMode AM; + if (X86SelectAddress(C, AM)) { + // If the expression is just a basereg, then we're done, otherwise we need + // to emit an LEA. + if (AM.BaseType == X86AddressMode::RegBase && + AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == 0) + return AM.Base.Reg; + + Opc = TLI.getPointerTy() == MVT::i32 ? X86::LEA32r : X86::LEA64r; + unsigned ResultReg = createResultReg(RC); + addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(Opc), ResultReg), AM); + return ResultReg; + } + return 0; + } + + // MachineConstantPool wants an explicit alignment. + unsigned Align = TD.getPrefTypeAlignment(C->getType()); + if (Align == 0) { + // Alignment of vector types. FIXME! + Align = TD.getTypeAllocSize(C->getType()); + } + + // x86-32 PIC requires a PIC base register for constant pools. + unsigned PICBase = 0; + unsigned char OpFlag = 0; + if (Subtarget->isPICStyleStubPIC()) { // Not dynamic-no-pic + OpFlag = X86II::MO_PIC_BASE_OFFSET; + PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF); + } else if (Subtarget->isPICStyleGOT()) { + OpFlag = X86II::MO_GOTOFF; + PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF); + } else if (Subtarget->isPICStyleRIPRel() && + TM.getCodeModel() == CodeModel::Small) { + PICBase = X86::RIP; + } + + // Create the load from the constant pool. + unsigned MCPOffset = MCP.getConstantPoolIndex(C, Align); + unsigned ResultReg = createResultReg(RC); + addConstantPoolReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(Opc), ResultReg), + MCPOffset, PICBase, OpFlag); + + return ResultReg; +} + +unsigned X86FastISel::TargetMaterializeAlloca(const AllocaInst *C) { + // Fail on dynamic allocas. At this point, getRegForValue has already + // checked its CSE maps, so if we're here trying to handle a dynamic + // alloca, we're not going to succeed. X86SelectAddress has a + // check for dynamic allocas, because it's called directly from + // various places, but TargetMaterializeAlloca also needs a check + // in order to avoid recursion between getRegForValue, + // X86SelectAddrss, and TargetMaterializeAlloca. + if (!FuncInfo.StaticAllocaMap.count(C)) + return 0; + + X86AddressMode AM; + if (!X86SelectAddress(C, AM)) + return 0; + unsigned Opc = Subtarget->is64Bit() ? 
X86::LEA64r : X86::LEA32r; + TargetRegisterClass* RC = TLI.getRegClassFor(TLI.getPointerTy()); + unsigned ResultReg = createResultReg(RC); + addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, + TII.get(Opc), ResultReg), AM); + return ResultReg; +} + +unsigned X86FastISel::TargetMaterializeFloatZero(const ConstantFP *CF) { + MVT VT; + if (!isTypeLegal(CF->getType(), VT)) + return false; + + // Get opcode and regclass for the given zero. + unsigned Opc = 0; + const TargetRegisterClass *RC = NULL; + switch (VT.SimpleTy) { + default: return false; + case MVT::f32: + if (X86ScalarSSEf32) { + Opc = Subtarget->hasAVX() ? X86::VFsFLD0SS : X86::FsFLD0SS; + RC = X86::FR32RegisterClass; + } else { + Opc = X86::LD_Fp032; + RC = X86::RFP32RegisterClass; + } + break; + case MVT::f64: + if (X86ScalarSSEf64) { + Opc = Subtarget->hasAVX() ? X86::VFsFLD0SD : X86::FsFLD0SD; + RC = X86::FR64RegisterClass; + } else { + Opc = X86::LD_Fp064; + RC = X86::RFP64RegisterClass; + } + break; + case MVT::f80: + // No f80 support yet. + return false; + } + + unsigned ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg); + return ResultReg; +} + + +/// TryToFoldLoad - The specified machine instr operand is a vreg, and that +/// vreg is being provided by the specified load instruction. If possible, +/// try to fold the load as an operand to the instruction, returning true if +/// possible. +bool X86FastISel::TryToFoldLoad(MachineInstr *MI, unsigned OpNo, + const LoadInst *LI) { + X86AddressMode AM; + if (!X86SelectAddress(LI->getOperand(0), AM)) + return false; + + X86InstrInfo &XII = (X86InstrInfo&)TII; + + unsigned Size = TD.getTypeAllocSize(LI->getType()); + unsigned Alignment = LI->getAlignment(); + + SmallVector<MachineOperand, 8> AddrOps; + AM.getFullAddress(AddrOps); + + MachineInstr *Result = + XII.foldMemoryOperandImpl(*FuncInfo.MF, MI, OpNo, AddrOps, Size, Alignment); + if (Result == 0) return false; + + FuncInfo.MBB->insert(FuncInfo.InsertPt, Result); + MI->eraseFromParent(); + return true; +} + + +namespace llvm { + llvm::FastISel *X86::createFastISel(FunctionLoweringInfo &funcInfo) { + return new X86FastISel(funcInfo); + } +} diff --git a/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp new file mode 100644 index 0000000..e3461c8 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp @@ -0,0 +1,1743 @@ +//===-- X86FloatingPoint.cpp - Floating point Reg -> Stack converter ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the pass which converts floating point instructions from +// pseudo registers into register stack instructions. This pass uses live +// variable information to indicate where the FPn registers are used and their +// lifetimes. +// +// The x87 hardware tracks liveness of the stack registers, so it is necessary +// to implement exact liveness tracking between basic blocks. The CFG edges are +// partitioned into bundles where the same FP registers must be live in +// identical stack positions. Instructions are inserted at the end of each basic +// block to rearrange the live registers to match the outgoing bundle. 
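+//
+// For example, all edges leaving a block land in the same bundle, and every
+// FP register live across those edges must occupy the same x87 stack slot on
+// each of them; the instructions inserted at the end of each block establish
+// that common layout.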
+// +// This approach avoids splitting critical edges at the potential cost of more +// live register shuffling instructions when critical edges are present. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "x86-codegen" +#include "X86.h" +#include "X86InstrInfo.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/EdgeBundles.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/InlineAsm.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" +#include <algorithm> +using namespace llvm; + +STATISTIC(NumFXCH, "Number of fxch instructions inserted"); +STATISTIC(NumFP , "Number of floating point instructions"); + +namespace { + struct FPS : public MachineFunctionPass { + static char ID; + FPS() : MachineFunctionPass(ID) { + initializeEdgeBundlesPass(*PassRegistry::getPassRegistry()); + // This is really only to keep valgrind quiet. + // The logic in isLive() is too much for it. + memset(Stack, 0, sizeof(Stack)); + memset(RegMap, 0, sizeof(RegMap)); + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + AU.addRequired<EdgeBundles>(); + AU.addPreservedID(MachineLoopInfoID); + AU.addPreservedID(MachineDominatorsID); + MachineFunctionPass::getAnalysisUsage(AU); + } + + virtual bool runOnMachineFunction(MachineFunction &MF); + + virtual const char *getPassName() const { return "X86 FP Stackifier"; } + + private: + const TargetInstrInfo *TII; // Machine instruction info. + + // Two CFG edges are related if they leave the same block, or enter the same + // block. The transitive closure of an edge under this relation is a + // LiveBundle. It represents a set of CFG edges where the live FP stack + // registers must be allocated identically in the x87 stack. + // + // A LiveBundle is usually all the edges leaving a block, or all the edges + // entering a block, but it can contain more edges if critical edges are + // present. + // + // The set of live FP registers in a LiveBundle is calculated by bundleCFG, + // but the exact mapping of FP registers to stack slots is fixed later. + struct LiveBundle { + // Bit mask of live FP registers. Bit 0 = FP0, bit 1 = FP1, &c. + unsigned Mask; + + // Number of pre-assigned live registers in FixStack. This is 0 when the + // stack order has not yet been fixed. + unsigned FixCount; + + // Assigned stack order for live-in registers. + // FixStack[i] == getStackEntry(i) for all i < FixCount. + unsigned char FixStack[8]; + + LiveBundle() : Mask(0), FixCount(0) {} + + // Have the live registers been assigned a stack order yet? + bool isFixed() const { return !Mask || FixCount; } + }; + + // Numbered LiveBundle structs. LiveBundles[0] is used for all CFG edges + // with no live FP registers. + SmallVector<LiveBundle, 8> LiveBundles; + + // The edge bundle analysis provides indices into the LiveBundles vector. + EdgeBundles *Bundles; + + // Return a bitmask of FP registers in block's live-in list. 
+    unsigned calcLiveInMask(MachineBasicBlock *MBB) {
+      unsigned Mask = 0;
+      for (MachineBasicBlock::livein_iterator I = MBB->livein_begin(),
+           E = MBB->livein_end(); I != E; ++I) {
+        unsigned Reg = *I - X86::FP0;
+        if (Reg < 8)
+          Mask |= 1 << Reg;
+      }
+      return Mask;
+    }
+
+    // Partition all the CFG edges into LiveBundles.
+    void bundleCFG(MachineFunction &MF);
+
+    MachineBasicBlock *MBB;     // Current basic block
+
+    // The hardware keeps track of how many FP registers are live, so we have
+    // to model that exactly. Usually, each live register corresponds to an
+    // FP<n> register, but when dealing with calls, returns, and inline
+    // assembly, it is sometimes necessary to have live scratch registers.
+    unsigned Stack[8];          // FP<n> Registers in each stack slot...
+    unsigned StackTop;          // The current top of the FP stack.
+
+    enum {
+      NumFPRegs = 16            // Including scratch pseudo-registers.
+    };
+
+    // For each live FP<n> register, point to its Stack[] entry.
+    // The first entries correspond to FP0-FP6, the rest are scratch registers
+    // used when we need slightly different live registers than what the
+    // register allocator thinks.
+    unsigned RegMap[NumFPRegs];
+
+    // Pending fixed registers - Inline assembly needs FP registers to appear
+    // in fixed stack slot positions. This is handled by copying FP registers
+    // to ST registers before the instruction, and copying back after the
+    // instruction.
+    //
+    // This is modeled with pending ST registers. NumPendingSTs is the number
+    // of ST registers (ST0-STn) we are tracking. PendingST[n] points to an FP
+    // register that holds the ST value. The ST registers are not moved into
+    // place until immediately before the instruction that needs them.
+    //
+    // It can happen that we need an ST register to be live when no FP register
+    // holds the value:
+    //
+    //   %ST0 = COPY %FP4<kill>
+    //
+    // When that happens, we allocate a scratch FP register to hold the ST
+    // value. That means every register in PendingST must be live.
+
+    unsigned NumPendingSTs;
+    unsigned char PendingST[8];
+
+    // Set up our stack model to match the incoming registers to MBB.
+    void setupBlockStack();
+
+    // Shuffle live registers to match the expectations of successor blocks.
+    void finishBlockStack();
+
+    void dumpStack() const {
+      dbgs() << "Stack contents:";
+      for (unsigned i = 0; i != StackTop; ++i) {
+        dbgs() << " FP" << Stack[i];
+        assert(RegMap[Stack[i]] == i && "Stack[] doesn't match RegMap[]!");
+      }
+      for (unsigned i = 0; i != NumPendingSTs; ++i)
+        dbgs() << ", ST" << i << " in FP" << unsigned(PendingST[i]);
+      dbgs() << "\n";
+    }
+
+    /// getSlot - Return the stack slot number a particular register number is
+    /// in.
+    unsigned getSlot(unsigned RegNo) const {
+      assert(RegNo < NumFPRegs && "Regno out of range!");
+      return RegMap[RegNo];
+    }
+
+    /// isLive - Is RegNo currently live in the stack?
+    bool isLive(unsigned RegNo) const {
+      unsigned Slot = getSlot(RegNo);
+      return Slot < StackTop && Stack[Slot] == RegNo;
+    }
+
+    /// getScratchReg - Return an FP register that is not currently in use.
+    unsigned getScratchReg() {
+      for (int i = NumFPRegs - 1; i >= 8; --i)
+        if (!isLive(i))
+          return i;
+      llvm_unreachable("Ran out of scratch FP registers");
+    }
+
+    /// isScratchReg - Returns true if RegNo is a scratch FP register.
+    bool isScratchReg(unsigned RegNo) {
+      return RegNo > 8 && RegNo < NumFPRegs;
+    }
+
+    /// getStackEntry - Return the X86::FP<n> register in register ST(i).
+ unsigned getStackEntry(unsigned STi) const { + if (STi >= StackTop) + report_fatal_error("Access past stack top!"); + return Stack[StackTop-1-STi]; + } + + /// getSTReg - Return the X86::ST(i) register which contains the specified + /// FP<RegNo> register. + unsigned getSTReg(unsigned RegNo) const { + return StackTop - 1 - getSlot(RegNo) + llvm::X86::ST0; + } + + // pushReg - Push the specified FP<n> register onto the stack. + void pushReg(unsigned Reg) { + assert(Reg < NumFPRegs && "Register number out of range!"); + if (StackTop >= 8) + report_fatal_error("Stack overflow!"); + Stack[StackTop] = Reg; + RegMap[Reg] = StackTop++; + } + + bool isAtTop(unsigned RegNo) const { return getSlot(RegNo) == StackTop-1; } + void moveToTop(unsigned RegNo, MachineBasicBlock::iterator I) { + DebugLoc dl = I == MBB->end() ? DebugLoc() : I->getDebugLoc(); + if (isAtTop(RegNo)) return; + + unsigned STReg = getSTReg(RegNo); + unsigned RegOnTop = getStackEntry(0); + + // Swap the slots the regs are in. + std::swap(RegMap[RegNo], RegMap[RegOnTop]); + + // Swap stack slot contents. + if (RegMap[RegOnTop] >= StackTop) + report_fatal_error("Access past stack top!"); + std::swap(Stack[RegMap[RegOnTop]], Stack[StackTop-1]); + + // Emit an fxch to update the runtime processors version of the state. + BuildMI(*MBB, I, dl, TII->get(X86::XCH_F)).addReg(STReg); + ++NumFXCH; + } + + void duplicateToTop(unsigned RegNo, unsigned AsReg, MachineInstr *I) { + DebugLoc dl = I == MBB->end() ? DebugLoc() : I->getDebugLoc(); + unsigned STReg = getSTReg(RegNo); + pushReg(AsReg); // New register on top of stack + + BuildMI(*MBB, I, dl, TII->get(X86::LD_Frr)).addReg(STReg); + } + + /// duplicatePendingSTBeforeKill - The instruction at I is about to kill + /// RegNo. If any PendingST registers still need the RegNo value, duplicate + /// them to new scratch registers. + void duplicatePendingSTBeforeKill(unsigned RegNo, MachineInstr *I) { + for (unsigned i = 0; i != NumPendingSTs; ++i) { + if (PendingST[i] != RegNo) + continue; + unsigned SR = getScratchReg(); + DEBUG(dbgs() << "Duplicating pending ST" << i + << " in FP" << RegNo << " to FP" << SR << '\n'); + duplicateToTop(RegNo, SR, I); + PendingST[i] = SR; + } + } + + /// popStackAfter - Pop the current value off of the top of the FP stack + /// after the specified instruction. + void popStackAfter(MachineBasicBlock::iterator &I); + + /// freeStackSlotAfter - Free the specified register from the register + /// stack, so that it is no longer in a register. If the register is + /// currently at the top of the stack, we just pop the current instruction, + /// otherwise we store the current top-of-stack into the specified slot, + /// then pop the top of stack. + void freeStackSlotAfter(MachineBasicBlock::iterator &I, unsigned Reg); + + /// freeStackSlotBefore - Just the pop, no folding. Return the inserted + /// instruction. + MachineBasicBlock::iterator + freeStackSlotBefore(MachineBasicBlock::iterator I, unsigned FPRegNo); + + /// Adjust the live registers to be the set in Mask. + void adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I); + + /// Shuffle the top FixCount stack entries such that FP reg FixStack[0] is + /// st(0), FP reg FixStack[1] is st(1) etc. 
+ void shuffleStackTop(const unsigned char *FixStack, unsigned FixCount, + MachineBasicBlock::iterator I); + + bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB); + + void handleZeroArgFP(MachineBasicBlock::iterator &I); + void handleOneArgFP(MachineBasicBlock::iterator &I); + void handleOneArgFPRW(MachineBasicBlock::iterator &I); + void handleTwoArgFP(MachineBasicBlock::iterator &I); + void handleCompareFP(MachineBasicBlock::iterator &I); + void handleCondMovFP(MachineBasicBlock::iterator &I); + void handleSpecialFP(MachineBasicBlock::iterator &I); + + // Check if a COPY instruction is using FP registers. + bool isFPCopy(MachineInstr *MI) { + unsigned DstReg = MI->getOperand(0).getReg(); + unsigned SrcReg = MI->getOperand(1).getReg(); + + return X86::RFP80RegClass.contains(DstReg) || + X86::RFP80RegClass.contains(SrcReg); + } + }; + char FPS::ID = 0; +} + +FunctionPass *llvm::createX86FloatingPointStackifierPass() { return new FPS(); } + +/// getFPReg - Return the X86::FPx register number for the specified operand. +/// For example, this returns 3 for X86::FP3. +static unsigned getFPReg(const MachineOperand &MO) { + assert(MO.isReg() && "Expected an FP register!"); + unsigned Reg = MO.getReg(); + assert(Reg >= X86::FP0 && Reg <= X86::FP6 && "Expected FP register!"); + return Reg - X86::FP0; +} + +/// runOnMachineFunction - Loop over all of the basic blocks, transforming FP +/// register references into FP stack references. +/// +bool FPS::runOnMachineFunction(MachineFunction &MF) { + // We only need to run this pass if there are any FP registers used in this + // function. If it is all integer, there is nothing for us to do! + bool FPIsUsed = false; + + assert(X86::FP6 == X86::FP0+6 && "Register enums aren't sorted right!"); + for (unsigned i = 0; i <= 6; ++i) + if (MF.getRegInfo().isPhysRegUsed(X86::FP0+i)) { + FPIsUsed = true; + break; + } + + // Early exit. + if (!FPIsUsed) return false; + + Bundles = &getAnalysis<EdgeBundles>(); + TII = MF.getTarget().getInstrInfo(); + + // Prepare cross-MBB liveness. + bundleCFG(MF); + + StackTop = 0; + + // Process the function in depth first order so that we process at least one + // of the predecessors for every reachable block in the function. + SmallPtrSet<MachineBasicBlock*, 8> Processed; + MachineBasicBlock *Entry = MF.begin(); + + bool Changed = false; + for (df_ext_iterator<MachineBasicBlock*, SmallPtrSet<MachineBasicBlock*, 8> > + I = df_ext_begin(Entry, Processed), E = df_ext_end(Entry, Processed); + I != E; ++I) + Changed |= processBasicBlock(MF, **I); + + // Process any unreachable blocks in arbitrary order now. + if (MF.size() != Processed.size()) + for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB) + if (Processed.insert(BB)) + Changed |= processBasicBlock(MF, *BB); + + LiveBundles.clear(); + + return Changed; +} + +/// bundleCFG - Scan all the basic blocks to determine consistent live-in and +/// live-out sets for the FP registers. Consistent means that the set of +/// registers live-out from a block is identical to the live-in set of all +/// successors. This is not enforced by the normal live-in lists since +/// registers may be implicitly defined, or not used by all successors. +void FPS::bundleCFG(MachineFunction &MF) { + assert(LiveBundles.empty() && "Stale data in LiveBundles"); + LiveBundles.resize(Bundles->getNumBundles()); + + // Gather the actual live-in masks for all MBBs. 
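+  // Bit N of a mask is set when FP<N> is live into a block; a mask of 0x5,
+  // for example, means FP0 and FP2 are expected on the stack. Each bundle
+  // mask is the union of the live-in masks of all blocks sharing that
+  // ingoing bundle.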
+ for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) { + MachineBasicBlock *MBB = I; + const unsigned Mask = calcLiveInMask(MBB); + if (!Mask) + continue; + // Update MBB ingoing bundle mask. + LiveBundles[Bundles->getBundle(MBB->getNumber(), false)].Mask |= Mask; + } +} + +/// processBasicBlock - Loop over all of the instructions in the basic block, +/// transforming FP instructions into their stack form. +/// +bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) { + bool Changed = false; + MBB = &BB; + NumPendingSTs = 0; + + setupBlockStack(); + + for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) { + MachineInstr *MI = I; + uint64_t Flags = MI->getDesc().TSFlags; + + unsigned FPInstClass = Flags & X86II::FPTypeMask; + if (MI->isInlineAsm()) + FPInstClass = X86II::SpecialFP; + + if (MI->isCopy() && isFPCopy(MI)) + FPInstClass = X86II::SpecialFP; + + if (MI->isImplicitDef() && + X86::RFP80RegClass.contains(MI->getOperand(0).getReg())) + FPInstClass = X86II::SpecialFP; + + if (FPInstClass == X86II::NotFP) + continue; // Efficiently ignore non-fp insts! + + MachineInstr *PrevMI = 0; + if (I != BB.begin()) + PrevMI = prior(I); + + ++NumFP; // Keep track of # of pseudo instrs + DEBUG(dbgs() << "\nFPInst:\t" << *MI); + + // Get dead variables list now because the MI pointer may be deleted as part + // of processing! + SmallVector<unsigned, 8> DeadRegs; + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (MO.isReg() && MO.isDead()) + DeadRegs.push_back(MO.getReg()); + } + + switch (FPInstClass) { + case X86II::ZeroArgFP: handleZeroArgFP(I); break; + case X86II::OneArgFP: handleOneArgFP(I); break; // fstp ST(0) + case X86II::OneArgFPRW: handleOneArgFPRW(I); break; // ST(0) = fsqrt(ST(0)) + case X86II::TwoArgFP: handleTwoArgFP(I); break; + case X86II::CompareFP: handleCompareFP(I); break; + case X86II::CondMovFP: handleCondMovFP(I); break; + case X86II::SpecialFP: handleSpecialFP(I); break; + default: llvm_unreachable("Unknown FP Type!"); + } + + // Check to see if any of the values defined by this instruction are dead + // after definition. If so, pop them. + for (unsigned i = 0, e = DeadRegs.size(); i != e; ++i) { + unsigned Reg = DeadRegs[i]; + if (Reg >= X86::FP0 && Reg <= X86::FP6) { + DEBUG(dbgs() << "Register FP#" << Reg-X86::FP0 << " is dead!\n"); + freeStackSlotAfter(I, Reg-X86::FP0); + } + } + + // Print out all of the instructions expanded to if -debug + DEBUG( + MachineBasicBlock::iterator PrevI(PrevMI); + if (I == PrevI) { + dbgs() << "Just deleted pseudo instruction\n"; + } else { + MachineBasicBlock::iterator Start = I; + // Rewind to first instruction newly inserted. + while (Start != BB.begin() && prior(Start) != PrevI) --Start; + dbgs() << "Inserted instructions:\n\t"; + Start->print(dbgs(), &MF.getTarget()); + while (++Start != llvm::next(I)) {} + } + dumpStack(); + ); + (void)PrevMI; + + Changed = true; + } + + finishBlockStack(); + + return Changed; +} + +/// setupBlockStack - Use the live bundles to set up our model of the stack +/// to match predecessors' live out stack. +void FPS::setupBlockStack() { + DEBUG(dbgs() << "\nSetting up live-ins for BB#" << MBB->getNumber() + << " derived from " << MBB->getName() << ".\n"); + StackTop = 0; + // Get the live-in bundle for MBB. 
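+  // The second getBundle() argument selects the ingoing (false) rather than
+  // the outgoing (true) edge bundle of MBB.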
+ const LiveBundle &Bundle = + LiveBundles[Bundles->getBundle(MBB->getNumber(), false)]; + + if (!Bundle.Mask) { + DEBUG(dbgs() << "Block has no FP live-ins.\n"); + return; + } + + // Depth-first iteration should ensure that we always have an assigned stack. + assert(Bundle.isFixed() && "Reached block before any predecessors"); + + // Push the fixed live-in registers. + for (unsigned i = Bundle.FixCount; i > 0; --i) { + MBB->addLiveIn(X86::ST0+i-1); + DEBUG(dbgs() << "Live-in st(" << (i-1) << "): %FP" + << unsigned(Bundle.FixStack[i-1]) << '\n'); + pushReg(Bundle.FixStack[i-1]); + } + + // Kill off unwanted live-ins. This can happen with a critical edge. + // FIXME: We could keep these live registers around as zombies. They may need + // to be revived at the end of a short block. It might save a few instrs. + adjustLiveRegs(calcLiveInMask(MBB), MBB->begin()); + DEBUG(MBB->dump()); +} + +/// finishBlockStack - Revive live-outs that are implicitly defined out of +/// MBB. Shuffle live registers to match the expected fixed stack of any +/// predecessors, and ensure that all predecessors are expecting the same +/// stack. +void FPS::finishBlockStack() { + // The RET handling below takes care of return blocks for us. + if (MBB->succ_empty()) + return; + + DEBUG(dbgs() << "Setting up live-outs for BB#" << MBB->getNumber() + << " derived from " << MBB->getName() << ".\n"); + + // Get MBB's live-out bundle. + unsigned BundleIdx = Bundles->getBundle(MBB->getNumber(), true); + LiveBundle &Bundle = LiveBundles[BundleIdx]; + + // We may need to kill and define some registers to match successors. + // FIXME: This can probably be combined with the shuffle below. + MachineBasicBlock::iterator Term = MBB->getFirstTerminator(); + adjustLiveRegs(Bundle.Mask, Term); + + if (!Bundle.Mask) { + DEBUG(dbgs() << "No live-outs.\n"); + return; + } + + // Has the stack order been fixed yet? + DEBUG(dbgs() << "LB#" << BundleIdx << ": "); + if (Bundle.isFixed()) { + DEBUG(dbgs() << "Shuffling stack to match.\n"); + shuffleStackTop(Bundle.FixStack, Bundle.FixCount, Term); + } else { + // Not fixed yet, we get to choose. 
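+    // Record the current stack order so every other block in this bundle
+    // gets shuffled to match it. E.g. with FP5 in st(0) and FP2 in st(1) we
+    // store FixCount = 2 and FixStack = { 5, 2 }.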
+ DEBUG(dbgs() << "Fixing stack order now.\n"); + Bundle.FixCount = StackTop; + for (unsigned i = 0; i < StackTop; ++i) + Bundle.FixStack[i] = getStackEntry(i); + } +} + + +//===----------------------------------------------------------------------===// +// Efficient Lookup Table Support +//===----------------------------------------------------------------------===// + +namespace { + struct TableEntry { + unsigned from; + unsigned to; + bool operator<(const TableEntry &TE) const { return from < TE.from; } + friend bool operator<(const TableEntry &TE, unsigned V) { + return TE.from < V; + } + friend bool LLVM_ATTRIBUTE_USED operator<(unsigned V, + const TableEntry &TE) { + return V < TE.from; + } + }; +} + +#ifndef NDEBUG +static bool TableIsSorted(const TableEntry *Table, unsigned NumEntries) { + for (unsigned i = 0; i != NumEntries-1; ++i) + if (!(Table[i] < Table[i+1])) return false; + return true; +} +#endif + +static int Lookup(const TableEntry *Table, unsigned N, unsigned Opcode) { + const TableEntry *I = std::lower_bound(Table, Table+N, Opcode); + if (I != Table+N && I->from == Opcode) + return I->to; + return -1; +} + +#ifdef NDEBUG +#define ASSERT_SORTED(TABLE) +#else +#define ASSERT_SORTED(TABLE) \ + { static bool TABLE##Checked = false; \ + if (!TABLE##Checked) { \ + assert(TableIsSorted(TABLE, array_lengthof(TABLE)) && \ + "All lookup tables must be sorted for efficient access!"); \ + TABLE##Checked = true; \ + } \ + } +#endif + +//===----------------------------------------------------------------------===// +// Register File -> Register Stack Mapping Methods +//===----------------------------------------------------------------------===// + +// OpcodeTable - Sorted map of register instructions to their stack version. +// The first element is an register file pseudo instruction, the second is the +// concrete X86 instruction which uses the register stack. 
+// +static const TableEntry OpcodeTable[] = { + { X86::ABS_Fp32 , X86::ABS_F }, + { X86::ABS_Fp64 , X86::ABS_F }, + { X86::ABS_Fp80 , X86::ABS_F }, + { X86::ADD_Fp32m , X86::ADD_F32m }, + { X86::ADD_Fp64m , X86::ADD_F64m }, + { X86::ADD_Fp64m32 , X86::ADD_F32m }, + { X86::ADD_Fp80m32 , X86::ADD_F32m }, + { X86::ADD_Fp80m64 , X86::ADD_F64m }, + { X86::ADD_FpI16m32 , X86::ADD_FI16m }, + { X86::ADD_FpI16m64 , X86::ADD_FI16m }, + { X86::ADD_FpI16m80 , X86::ADD_FI16m }, + { X86::ADD_FpI32m32 , X86::ADD_FI32m }, + { X86::ADD_FpI32m64 , X86::ADD_FI32m }, + { X86::ADD_FpI32m80 , X86::ADD_FI32m }, + { X86::CHS_Fp32 , X86::CHS_F }, + { X86::CHS_Fp64 , X86::CHS_F }, + { X86::CHS_Fp80 , X86::CHS_F }, + { X86::CMOVBE_Fp32 , X86::CMOVBE_F }, + { X86::CMOVBE_Fp64 , X86::CMOVBE_F }, + { X86::CMOVBE_Fp80 , X86::CMOVBE_F }, + { X86::CMOVB_Fp32 , X86::CMOVB_F }, + { X86::CMOVB_Fp64 , X86::CMOVB_F }, + { X86::CMOVB_Fp80 , X86::CMOVB_F }, + { X86::CMOVE_Fp32 , X86::CMOVE_F }, + { X86::CMOVE_Fp64 , X86::CMOVE_F }, + { X86::CMOVE_Fp80 , X86::CMOVE_F }, + { X86::CMOVNBE_Fp32 , X86::CMOVNBE_F }, + { X86::CMOVNBE_Fp64 , X86::CMOVNBE_F }, + { X86::CMOVNBE_Fp80 , X86::CMOVNBE_F }, + { X86::CMOVNB_Fp32 , X86::CMOVNB_F }, + { X86::CMOVNB_Fp64 , X86::CMOVNB_F }, + { X86::CMOVNB_Fp80 , X86::CMOVNB_F }, + { X86::CMOVNE_Fp32 , X86::CMOVNE_F }, + { X86::CMOVNE_Fp64 , X86::CMOVNE_F }, + { X86::CMOVNE_Fp80 , X86::CMOVNE_F }, + { X86::CMOVNP_Fp32 , X86::CMOVNP_F }, + { X86::CMOVNP_Fp64 , X86::CMOVNP_F }, + { X86::CMOVNP_Fp80 , X86::CMOVNP_F }, + { X86::CMOVP_Fp32 , X86::CMOVP_F }, + { X86::CMOVP_Fp64 , X86::CMOVP_F }, + { X86::CMOVP_Fp80 , X86::CMOVP_F }, + { X86::COS_Fp32 , X86::COS_F }, + { X86::COS_Fp64 , X86::COS_F }, + { X86::COS_Fp80 , X86::COS_F }, + { X86::DIVR_Fp32m , X86::DIVR_F32m }, + { X86::DIVR_Fp64m , X86::DIVR_F64m }, + { X86::DIVR_Fp64m32 , X86::DIVR_F32m }, + { X86::DIVR_Fp80m32 , X86::DIVR_F32m }, + { X86::DIVR_Fp80m64 , X86::DIVR_F64m }, + { X86::DIVR_FpI16m32, X86::DIVR_FI16m}, + { X86::DIVR_FpI16m64, X86::DIVR_FI16m}, + { X86::DIVR_FpI16m80, X86::DIVR_FI16m}, + { X86::DIVR_FpI32m32, X86::DIVR_FI32m}, + { X86::DIVR_FpI32m64, X86::DIVR_FI32m}, + { X86::DIVR_FpI32m80, X86::DIVR_FI32m}, + { X86::DIV_Fp32m , X86::DIV_F32m }, + { X86::DIV_Fp64m , X86::DIV_F64m }, + { X86::DIV_Fp64m32 , X86::DIV_F32m }, + { X86::DIV_Fp80m32 , X86::DIV_F32m }, + { X86::DIV_Fp80m64 , X86::DIV_F64m }, + { X86::DIV_FpI16m32 , X86::DIV_FI16m }, + { X86::DIV_FpI16m64 , X86::DIV_FI16m }, + { X86::DIV_FpI16m80 , X86::DIV_FI16m }, + { X86::DIV_FpI32m32 , X86::DIV_FI32m }, + { X86::DIV_FpI32m64 , X86::DIV_FI32m }, + { X86::DIV_FpI32m80 , X86::DIV_FI32m }, + { X86::ILD_Fp16m32 , X86::ILD_F16m }, + { X86::ILD_Fp16m64 , X86::ILD_F16m }, + { X86::ILD_Fp16m80 , X86::ILD_F16m }, + { X86::ILD_Fp32m32 , X86::ILD_F32m }, + { X86::ILD_Fp32m64 , X86::ILD_F32m }, + { X86::ILD_Fp32m80 , X86::ILD_F32m }, + { X86::ILD_Fp64m32 , X86::ILD_F64m }, + { X86::ILD_Fp64m64 , X86::ILD_F64m }, + { X86::ILD_Fp64m80 , X86::ILD_F64m }, + { X86::ISTT_Fp16m32 , X86::ISTT_FP16m}, + { X86::ISTT_Fp16m64 , X86::ISTT_FP16m}, + { X86::ISTT_Fp16m80 , X86::ISTT_FP16m}, + { X86::ISTT_Fp32m32 , X86::ISTT_FP32m}, + { X86::ISTT_Fp32m64 , X86::ISTT_FP32m}, + { X86::ISTT_Fp32m80 , X86::ISTT_FP32m}, + { X86::ISTT_Fp64m32 , X86::ISTT_FP64m}, + { X86::ISTT_Fp64m64 , X86::ISTT_FP64m}, + { X86::ISTT_Fp64m80 , X86::ISTT_FP64m}, + { X86::IST_Fp16m32 , X86::IST_F16m }, + { X86::IST_Fp16m64 , X86::IST_F16m }, + { X86::IST_Fp16m80 , X86::IST_F16m }, + { X86::IST_Fp32m32 , X86::IST_F32m }, + 
{ X86::IST_Fp32m64 , X86::IST_F32m }, + { X86::IST_Fp32m80 , X86::IST_F32m }, + { X86::IST_Fp64m32 , X86::IST_FP64m }, + { X86::IST_Fp64m64 , X86::IST_FP64m }, + { X86::IST_Fp64m80 , X86::IST_FP64m }, + { X86::LD_Fp032 , X86::LD_F0 }, + { X86::LD_Fp064 , X86::LD_F0 }, + { X86::LD_Fp080 , X86::LD_F0 }, + { X86::LD_Fp132 , X86::LD_F1 }, + { X86::LD_Fp164 , X86::LD_F1 }, + { X86::LD_Fp180 , X86::LD_F1 }, + { X86::LD_Fp32m , X86::LD_F32m }, + { X86::LD_Fp32m64 , X86::LD_F32m }, + { X86::LD_Fp32m80 , X86::LD_F32m }, + { X86::LD_Fp64m , X86::LD_F64m }, + { X86::LD_Fp64m80 , X86::LD_F64m }, + { X86::LD_Fp80m , X86::LD_F80m }, + { X86::MUL_Fp32m , X86::MUL_F32m }, + { X86::MUL_Fp64m , X86::MUL_F64m }, + { X86::MUL_Fp64m32 , X86::MUL_F32m }, + { X86::MUL_Fp80m32 , X86::MUL_F32m }, + { X86::MUL_Fp80m64 , X86::MUL_F64m }, + { X86::MUL_FpI16m32 , X86::MUL_FI16m }, + { X86::MUL_FpI16m64 , X86::MUL_FI16m }, + { X86::MUL_FpI16m80 , X86::MUL_FI16m }, + { X86::MUL_FpI32m32 , X86::MUL_FI32m }, + { X86::MUL_FpI32m64 , X86::MUL_FI32m }, + { X86::MUL_FpI32m80 , X86::MUL_FI32m }, + { X86::SIN_Fp32 , X86::SIN_F }, + { X86::SIN_Fp64 , X86::SIN_F }, + { X86::SIN_Fp80 , X86::SIN_F }, + { X86::SQRT_Fp32 , X86::SQRT_F }, + { X86::SQRT_Fp64 , X86::SQRT_F }, + { X86::SQRT_Fp80 , X86::SQRT_F }, + { X86::ST_Fp32m , X86::ST_F32m }, + { X86::ST_Fp64m , X86::ST_F64m }, + { X86::ST_Fp64m32 , X86::ST_F32m }, + { X86::ST_Fp80m32 , X86::ST_F32m }, + { X86::ST_Fp80m64 , X86::ST_F64m }, + { X86::ST_FpP80m , X86::ST_FP80m }, + { X86::SUBR_Fp32m , X86::SUBR_F32m }, + { X86::SUBR_Fp64m , X86::SUBR_F64m }, + { X86::SUBR_Fp64m32 , X86::SUBR_F32m }, + { X86::SUBR_Fp80m32 , X86::SUBR_F32m }, + { X86::SUBR_Fp80m64 , X86::SUBR_F64m }, + { X86::SUBR_FpI16m32, X86::SUBR_FI16m}, + { X86::SUBR_FpI16m64, X86::SUBR_FI16m}, + { X86::SUBR_FpI16m80, X86::SUBR_FI16m}, + { X86::SUBR_FpI32m32, X86::SUBR_FI32m}, + { X86::SUBR_FpI32m64, X86::SUBR_FI32m}, + { X86::SUBR_FpI32m80, X86::SUBR_FI32m}, + { X86::SUB_Fp32m , X86::SUB_F32m }, + { X86::SUB_Fp64m , X86::SUB_F64m }, + { X86::SUB_Fp64m32 , X86::SUB_F32m }, + { X86::SUB_Fp80m32 , X86::SUB_F32m }, + { X86::SUB_Fp80m64 , X86::SUB_F64m }, + { X86::SUB_FpI16m32 , X86::SUB_FI16m }, + { X86::SUB_FpI16m64 , X86::SUB_FI16m }, + { X86::SUB_FpI16m80 , X86::SUB_FI16m }, + { X86::SUB_FpI32m32 , X86::SUB_FI32m }, + { X86::SUB_FpI32m64 , X86::SUB_FI32m }, + { X86::SUB_FpI32m80 , X86::SUB_FI32m }, + { X86::TST_Fp32 , X86::TST_F }, + { X86::TST_Fp64 , X86::TST_F }, + { X86::TST_Fp80 , X86::TST_F }, + { X86::UCOM_FpIr32 , X86::UCOM_FIr }, + { X86::UCOM_FpIr64 , X86::UCOM_FIr }, + { X86::UCOM_FpIr80 , X86::UCOM_FIr }, + { X86::UCOM_Fpr32 , X86::UCOM_Fr }, + { X86::UCOM_Fpr64 , X86::UCOM_Fr }, + { X86::UCOM_Fpr80 , X86::UCOM_Fr }, +}; + +static unsigned getConcreteOpcode(unsigned Opcode) { + ASSERT_SORTED(OpcodeTable); + int Opc = Lookup(OpcodeTable, array_lengthof(OpcodeTable), Opcode); + assert(Opc != -1 && "FP Stack instruction not in OpcodeTable!"); + return Opc; +} + +//===----------------------------------------------------------------------===// +// Helper Methods +//===----------------------------------------------------------------------===// + +// PopTable - Sorted map of instructions to their popping version. The first +// element is an instruction, the second is the version which pops. 
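+// For example ST_F32m (fst) maps to ST_FP32m (fstp), so a store whose source
+// is killed can be folded into a single popping store instead of emitting a
+// separate fstp st(0) afterwards.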
+// +static const TableEntry PopTable[] = { + { X86::ADD_FrST0 , X86::ADD_FPrST0 }, + + { X86::DIVR_FrST0, X86::DIVR_FPrST0 }, + { X86::DIV_FrST0 , X86::DIV_FPrST0 }, + + { X86::IST_F16m , X86::IST_FP16m }, + { X86::IST_F32m , X86::IST_FP32m }, + + { X86::MUL_FrST0 , X86::MUL_FPrST0 }, + + { X86::ST_F32m , X86::ST_FP32m }, + { X86::ST_F64m , X86::ST_FP64m }, + { X86::ST_Frr , X86::ST_FPrr }, + + { X86::SUBR_FrST0, X86::SUBR_FPrST0 }, + { X86::SUB_FrST0 , X86::SUB_FPrST0 }, + + { X86::UCOM_FIr , X86::UCOM_FIPr }, + + { X86::UCOM_FPr , X86::UCOM_FPPr }, + { X86::UCOM_Fr , X86::UCOM_FPr }, +}; + +/// popStackAfter - Pop the current value off of the top of the FP stack after +/// the specified instruction. This attempts to be sneaky and combine the pop +/// into the instruction itself if possible. The iterator is left pointing to +/// the last instruction, be it a new pop instruction inserted, or the old +/// instruction if it was modified in place. +/// +void FPS::popStackAfter(MachineBasicBlock::iterator &I) { + MachineInstr* MI = I; + DebugLoc dl = MI->getDebugLoc(); + ASSERT_SORTED(PopTable); + if (StackTop == 0) + report_fatal_error("Cannot pop empty stack!"); + RegMap[Stack[--StackTop]] = ~0; // Update state + + // Check to see if there is a popping version of this instruction... + int Opcode = Lookup(PopTable, array_lengthof(PopTable), I->getOpcode()); + if (Opcode != -1) { + I->setDesc(TII->get(Opcode)); + if (Opcode == X86::UCOM_FPPr) + I->RemoveOperand(0); + } else { // Insert an explicit pop + I = BuildMI(*MBB, ++I, dl, TII->get(X86::ST_FPrr)).addReg(X86::ST0); + } +} + +/// freeStackSlotAfter - Free the specified register from the register stack, so +/// that it is no longer in a register. If the register is currently at the top +/// of the stack, we just pop the current instruction, otherwise we store the +/// current top-of-stack into the specified slot, then pop the top of stack. +void FPS::freeStackSlotAfter(MachineBasicBlock::iterator &I, unsigned FPRegNo) { + if (getStackEntry(0) == FPRegNo) { // already at the top of stack? easy. + popStackAfter(I); + return; + } + + // Otherwise, store the top of stack into the dead slot, killing the operand + // without having to add in an explicit xchg then pop. + // + I = freeStackSlotBefore(++I, FPRegNo); +} + +/// freeStackSlotBefore - Free the specified register without trying any +/// folding. +MachineBasicBlock::iterator +FPS::freeStackSlotBefore(MachineBasicBlock::iterator I, unsigned FPRegNo) { + unsigned STReg = getSTReg(FPRegNo); + unsigned OldSlot = getSlot(FPRegNo); + unsigned TopReg = Stack[StackTop-1]; + Stack[OldSlot] = TopReg; + RegMap[TopReg] = OldSlot; + RegMap[FPRegNo] = ~0; + Stack[--StackTop] = ~0; + return BuildMI(*MBB, I, DebugLoc(), TII->get(X86::ST_FPrr)).addReg(STReg); +} + +/// adjustLiveRegs - Kill and revive registers such that exactly the FP +/// registers with a bit in Mask are live. +void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) { + unsigned Defs = Mask; + unsigned Kills = 0; + for (unsigned i = 0; i < StackTop; ++i) { + unsigned RegNo = Stack[i]; + if (!(Defs & (1 << RegNo))) + // This register is live, but we don't want it. + Kills |= (1 << RegNo); + else + // We don't need to imp-def this live register. + Defs &= ~(1 << RegNo); + } + assert((Kills & Defs) == 0 && "Register needs killing and def'ing?"); + + // Produce implicit-defs for free by using killed registers. 
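+  // E.g. if FP3 must be killed and FP5 must be defined, renaming FP3's stack
+  // slot to FP5 satisfies both requests without emitting any instructions.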
+ while (Kills && Defs) { + unsigned KReg = CountTrailingZeros_32(Kills); + unsigned DReg = CountTrailingZeros_32(Defs); + DEBUG(dbgs() << "Renaming %FP" << KReg << " as imp %FP" << DReg << "\n"); + std::swap(Stack[getSlot(KReg)], Stack[getSlot(DReg)]); + std::swap(RegMap[KReg], RegMap[DReg]); + Kills &= ~(1 << KReg); + Defs &= ~(1 << DReg); + } + + // Kill registers by popping. + if (Kills && I != MBB->begin()) { + MachineBasicBlock::iterator I2 = llvm::prior(I); + while (StackTop) { + unsigned KReg = getStackEntry(0); + if (!(Kills & (1 << KReg))) + break; + DEBUG(dbgs() << "Popping %FP" << KReg << "\n"); + popStackAfter(I2); + Kills &= ~(1 << KReg); + } + } + + // Manually kill the rest. + while (Kills) { + unsigned KReg = CountTrailingZeros_32(Kills); + DEBUG(dbgs() << "Killing %FP" << KReg << "\n"); + freeStackSlotBefore(I, KReg); + Kills &= ~(1 << KReg); + } + + // Load zeros for all the imp-defs. + while(Defs) { + unsigned DReg = CountTrailingZeros_32(Defs); + DEBUG(dbgs() << "Defining %FP" << DReg << " as 0\n"); + BuildMI(*MBB, I, DebugLoc(), TII->get(X86::LD_F0)); + pushReg(DReg); + Defs &= ~(1 << DReg); + } + + // Now we should have the correct registers live. + DEBUG(dumpStack()); + assert(StackTop == CountPopulation_32(Mask) && "Live count mismatch"); +} + +/// shuffleStackTop - emit fxch instructions before I to shuffle the top +/// FixCount entries into the order given by FixStack. +/// FIXME: Is there a better algorithm than insertion sort? +void FPS::shuffleStackTop(const unsigned char *FixStack, + unsigned FixCount, + MachineBasicBlock::iterator I) { + // Move items into place, starting from the desired stack bottom. + while (FixCount--) { + // Old register at position FixCount. + unsigned OldReg = getStackEntry(FixCount); + // Desired register at position FixCount. + unsigned Reg = FixStack[FixCount]; + if (Reg == OldReg) + continue; + // (Reg st0) (OldReg st0) = (Reg OldReg st0) + moveToTop(Reg, I); + if (FixCount > 0) + moveToTop(OldReg, I); + } + DEBUG(dumpStack()); +} + + +//===----------------------------------------------------------------------===// +// Instruction transformation implementation +//===----------------------------------------------------------------------===// + +/// handleZeroArgFP - ST(0) = fld0 ST(0) = flds <mem> +/// +void FPS::handleZeroArgFP(MachineBasicBlock::iterator &I) { + MachineInstr *MI = I; + unsigned DestReg = getFPReg(MI->getOperand(0)); + + // Change from the pseudo instruction to the concrete instruction. + MI->RemoveOperand(0); // Remove the explicit ST(0) operand + MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode()))); + + // Result gets pushed on the stack. + pushReg(DestReg); +} + +/// handleOneArgFP - fst <mem>, ST(0) +/// +void FPS::handleOneArgFP(MachineBasicBlock::iterator &I) { + MachineInstr *MI = I; + unsigned NumOps = MI->getDesc().getNumOperands(); + assert((NumOps == X86::AddrNumOperands + 1 || NumOps == 1) && + "Can only handle fst* & ftst instructions!"); + + // Is this the last use of the source register? + unsigned Reg = getFPReg(MI->getOperand(NumOps-1)); + bool KillsSrc = MI->killsRegister(X86::FP0+Reg); + + if (KillsSrc) + duplicatePendingSTBeforeKill(Reg, I); + + // FISTP64m is strange because there isn't a non-popping versions. + // If we have one _and_ we don't want to pop the operand, duplicate the value + // on the stack instead of moving it. This ensure that popping the value is + // always ok. + // Ditto FISTTP16m, FISTTP32m, FISTTP64m, ST_FpP80m. 
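+  // E.g. an ISTT_Fp32m32 whose %FP2 operand stays live first copies FP2 to a
+  // scratch register with an fld; the popping store then consumes the copy
+  // while the original value remains on the stack.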
+ // + if (!KillsSrc && + (MI->getOpcode() == X86::IST_Fp64m32 || + MI->getOpcode() == X86::ISTT_Fp16m32 || + MI->getOpcode() == X86::ISTT_Fp32m32 || + MI->getOpcode() == X86::ISTT_Fp64m32 || + MI->getOpcode() == X86::IST_Fp64m64 || + MI->getOpcode() == X86::ISTT_Fp16m64 || + MI->getOpcode() == X86::ISTT_Fp32m64 || + MI->getOpcode() == X86::ISTT_Fp64m64 || + MI->getOpcode() == X86::IST_Fp64m80 || + MI->getOpcode() == X86::ISTT_Fp16m80 || + MI->getOpcode() == X86::ISTT_Fp32m80 || + MI->getOpcode() == X86::ISTT_Fp64m80 || + MI->getOpcode() == X86::ST_FpP80m)) { + duplicateToTop(Reg, getScratchReg(), I); + } else { + moveToTop(Reg, I); // Move to the top of the stack... + } + + // Convert from the pseudo instruction to the concrete instruction. + MI->RemoveOperand(NumOps-1); // Remove explicit ST(0) operand + MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode()))); + + if (MI->getOpcode() == X86::IST_FP64m || + MI->getOpcode() == X86::ISTT_FP16m || + MI->getOpcode() == X86::ISTT_FP32m || + MI->getOpcode() == X86::ISTT_FP64m || + MI->getOpcode() == X86::ST_FP80m) { + if (StackTop == 0) + report_fatal_error("Stack empty??"); + --StackTop; + } else if (KillsSrc) { // Last use of operand? + popStackAfter(I); + } +} + + +/// handleOneArgFPRW: Handle instructions that read from the top of stack and +/// replace the value with a newly computed value. These instructions may have +/// non-fp operands after their FP operands. +/// +/// Examples: +/// R1 = fchs R2 +/// R1 = fadd R2, [mem] +/// +void FPS::handleOneArgFPRW(MachineBasicBlock::iterator &I) { + MachineInstr *MI = I; +#ifndef NDEBUG + unsigned NumOps = MI->getDesc().getNumOperands(); + assert(NumOps >= 2 && "FPRW instructions must have 2 ops!!"); +#endif + + // Is this the last use of the source register? + unsigned Reg = getFPReg(MI->getOperand(1)); + bool KillsSrc = MI->killsRegister(X86::FP0+Reg); + + if (KillsSrc) { + duplicatePendingSTBeforeKill(Reg, I); + // If this is the last use of the source register, just make sure it's on + // the top of the stack. + moveToTop(Reg, I); + if (StackTop == 0) + report_fatal_error("Stack cannot be empty!"); + --StackTop; + pushReg(getFPReg(MI->getOperand(0))); + } else { + // If this is not the last use of the source register, _copy_ it to the top + // of the stack. + duplicateToTop(Reg, getFPReg(MI->getOperand(0)), I); + } + + // Change from the pseudo instruction to the concrete instruction. + MI->RemoveOperand(1); // Drop the source operand. + MI->RemoveOperand(0); // Drop the destination operand. 
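+  // Rewrite the pseudo into its concrete form, e.g. CHS_Fp32 into CHS_F,
+  // which implicitly operates on st(0).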
+ MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode()))); +} + + +//===----------------------------------------------------------------------===// +// Define tables of various ways to map pseudo instructions +// + +// ForwardST0Table - Map: A = B op C into: ST(0) = ST(0) op ST(i) +static const TableEntry ForwardST0Table[] = { + { X86::ADD_Fp32 , X86::ADD_FST0r }, + { X86::ADD_Fp64 , X86::ADD_FST0r }, + { X86::ADD_Fp80 , X86::ADD_FST0r }, + { X86::DIV_Fp32 , X86::DIV_FST0r }, + { X86::DIV_Fp64 , X86::DIV_FST0r }, + { X86::DIV_Fp80 , X86::DIV_FST0r }, + { X86::MUL_Fp32 , X86::MUL_FST0r }, + { X86::MUL_Fp64 , X86::MUL_FST0r }, + { X86::MUL_Fp80 , X86::MUL_FST0r }, + { X86::SUB_Fp32 , X86::SUB_FST0r }, + { X86::SUB_Fp64 , X86::SUB_FST0r }, + { X86::SUB_Fp80 , X86::SUB_FST0r }, +}; + +// ReverseST0Table - Map: A = B op C into: ST(0) = ST(i) op ST(0) +static const TableEntry ReverseST0Table[] = { + { X86::ADD_Fp32 , X86::ADD_FST0r }, // commutative + { X86::ADD_Fp64 , X86::ADD_FST0r }, // commutative + { X86::ADD_Fp80 , X86::ADD_FST0r }, // commutative + { X86::DIV_Fp32 , X86::DIVR_FST0r }, + { X86::DIV_Fp64 , X86::DIVR_FST0r }, + { X86::DIV_Fp80 , X86::DIVR_FST0r }, + { X86::MUL_Fp32 , X86::MUL_FST0r }, // commutative + { X86::MUL_Fp64 , X86::MUL_FST0r }, // commutative + { X86::MUL_Fp80 , X86::MUL_FST0r }, // commutative + { X86::SUB_Fp32 , X86::SUBR_FST0r }, + { X86::SUB_Fp64 , X86::SUBR_FST0r }, + { X86::SUB_Fp80 , X86::SUBR_FST0r }, +}; + +// ForwardSTiTable - Map: A = B op C into: ST(i) = ST(0) op ST(i) +static const TableEntry ForwardSTiTable[] = { + { X86::ADD_Fp32 , X86::ADD_FrST0 }, // commutative + { X86::ADD_Fp64 , X86::ADD_FrST0 }, // commutative + { X86::ADD_Fp80 , X86::ADD_FrST0 }, // commutative + { X86::DIV_Fp32 , X86::DIVR_FrST0 }, + { X86::DIV_Fp64 , X86::DIVR_FrST0 }, + { X86::DIV_Fp80 , X86::DIVR_FrST0 }, + { X86::MUL_Fp32 , X86::MUL_FrST0 }, // commutative + { X86::MUL_Fp64 , X86::MUL_FrST0 }, // commutative + { X86::MUL_Fp80 , X86::MUL_FrST0 }, // commutative + { X86::SUB_Fp32 , X86::SUBR_FrST0 }, + { X86::SUB_Fp64 , X86::SUBR_FrST0 }, + { X86::SUB_Fp80 , X86::SUBR_FrST0 }, +}; + +// ReverseSTiTable - Map: A = B op C into: ST(i) = ST(i) op ST(0) +static const TableEntry ReverseSTiTable[] = { + { X86::ADD_Fp32 , X86::ADD_FrST0 }, + { X86::ADD_Fp64 , X86::ADD_FrST0 }, + { X86::ADD_Fp80 , X86::ADD_FrST0 }, + { X86::DIV_Fp32 , X86::DIV_FrST0 }, + { X86::DIV_Fp64 , X86::DIV_FrST0 }, + { X86::DIV_Fp80 , X86::DIV_FrST0 }, + { X86::MUL_Fp32 , X86::MUL_FrST0 }, + { X86::MUL_Fp64 , X86::MUL_FrST0 }, + { X86::MUL_Fp80 , X86::MUL_FrST0 }, + { X86::SUB_Fp32 , X86::SUB_FrST0 }, + { X86::SUB_Fp64 , X86::SUB_FrST0 }, + { X86::SUB_Fp80 , X86::SUB_FrST0 }, +}; + + +/// handleTwoArgFP - Handle instructions like FADD and friends which are virtual +/// instructions which need to be simplified and possibly transformed. 
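+/// Which of the four tables above is used depends on which operand is on top
+/// of the stack and which operands are killed. For example,
+/// %FP2 = ADD_Fp32 %FP0<kill>, %FP1 with FP0 in st(0) and FP1 still live is
+/// lowered to ADD_FST0r (ST(0) = ST(0) + ST(i)), and the st(0) slot is then
+/// renamed to FP2.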
+/// +/// Result: ST(0) = fsub ST(0), ST(i) +/// ST(i) = fsub ST(0), ST(i) +/// ST(0) = fsubr ST(0), ST(i) +/// ST(i) = fsubr ST(0), ST(i) +/// +void FPS::handleTwoArgFP(MachineBasicBlock::iterator &I) { + ASSERT_SORTED(ForwardST0Table); ASSERT_SORTED(ReverseST0Table); + ASSERT_SORTED(ForwardSTiTable); ASSERT_SORTED(ReverseSTiTable); + MachineInstr *MI = I; + + unsigned NumOperands = MI->getDesc().getNumOperands(); + assert(NumOperands == 3 && "Illegal TwoArgFP instruction!"); + unsigned Dest = getFPReg(MI->getOperand(0)); + unsigned Op0 = getFPReg(MI->getOperand(NumOperands-2)); + unsigned Op1 = getFPReg(MI->getOperand(NumOperands-1)); + bool KillsOp0 = MI->killsRegister(X86::FP0+Op0); + bool KillsOp1 = MI->killsRegister(X86::FP0+Op1); + DebugLoc dl = MI->getDebugLoc(); + + unsigned TOS = getStackEntry(0); + + // One of our operands must be on the top of the stack. If neither is yet, we + // need to move one. + if (Op0 != TOS && Op1 != TOS) { // No operand at TOS? + // We can choose to move either operand to the top of the stack. If one of + // the operands is killed by this instruction, we want that one so that we + // can update right on top of the old version. + if (KillsOp0) { + moveToTop(Op0, I); // Move dead operand to TOS. + TOS = Op0; + } else if (KillsOp1) { + moveToTop(Op1, I); + TOS = Op1; + } else { + // All of the operands are live after this instruction executes, so we + // cannot update on top of any operand. Because of this, we must + // duplicate one of the stack elements to the top. It doesn't matter + // which one we pick. + // + duplicateToTop(Op0, Dest, I); + Op0 = TOS = Dest; + KillsOp0 = true; + } + } else if (!KillsOp0 && !KillsOp1) { + // If we DO have one of our operands at the top of the stack, but we don't + // have a dead operand, we must duplicate one of the operands to a new slot + // on the stack. + duplicateToTop(Op0, Dest, I); + Op0 = TOS = Dest; + KillsOp0 = true; + } + + // Now we know that one of our operands is on the top of the stack, and at + // least one of our operands is killed by this instruction. + assert((TOS == Op0 || TOS == Op1) && (KillsOp0 || KillsOp1) && + "Stack conditions not set up right!"); + + // We decide which form to use based on what is on the top of the stack, and + // which operand is killed by this instruction. + const TableEntry *InstTable; + bool isForward = TOS == Op0; + bool updateST0 = (TOS == Op0 && !KillsOp1) || (TOS == Op1 && !KillsOp0); + if (updateST0) { + if (isForward) + InstTable = ForwardST0Table; + else + InstTable = ReverseST0Table; + } else { + if (isForward) + InstTable = ForwardSTiTable; + else + InstTable = ReverseSTiTable; + } + + int Opcode = Lookup(InstTable, array_lengthof(ForwardST0Table), + MI->getOpcode()); + assert(Opcode != -1 && "Unknown TwoArgFP pseudo instruction!"); + + // NotTOS - The register which is not on the top of stack... + unsigned NotTOS = (TOS == Op0) ? Op1 : Op0; + + // Replace the old instruction with a new instruction + MBB->remove(I++); + I = BuildMI(*MBB, I, dl, TII->get(Opcode)).addReg(getSTReg(NotTOS)); + + // If both operands are killed, pop one off of the stack in addition to + // overwriting the other one. + if (KillsOp0 && KillsOp1 && Op0 != Op1) { + assert(!updateST0 && "Should have updated other operand!"); + popStackAfter(I); // Pop the top of stack + } + + // Update stack information so that we know the destination register is now on + // the stack. + unsigned UpdatedSlot = getSlot(updateST0 ? 
TOS : NotTOS); + assert(UpdatedSlot < StackTop && Dest < 7); + Stack[UpdatedSlot] = Dest; + RegMap[Dest] = UpdatedSlot; + MBB->getParent()->DeleteMachineInstr(MI); // Remove the old instruction +} + +/// handleCompareFP - Handle FUCOM and FUCOMI instructions, which have two FP +/// register arguments and no explicit destinations. +/// +void FPS::handleCompareFP(MachineBasicBlock::iterator &I) { + ASSERT_SORTED(ForwardST0Table); ASSERT_SORTED(ReverseST0Table); + ASSERT_SORTED(ForwardSTiTable); ASSERT_SORTED(ReverseSTiTable); + MachineInstr *MI = I; + + unsigned NumOperands = MI->getDesc().getNumOperands(); + assert(NumOperands == 2 && "Illegal FUCOM* instruction!"); + unsigned Op0 = getFPReg(MI->getOperand(NumOperands-2)); + unsigned Op1 = getFPReg(MI->getOperand(NumOperands-1)); + bool KillsOp0 = MI->killsRegister(X86::FP0+Op0); + bool KillsOp1 = MI->killsRegister(X86::FP0+Op1); + + // Make sure the first operand is on the top of stack, the other one can be + // anywhere. + moveToTop(Op0, I); + + // Change from the pseudo instruction to the concrete instruction. + MI->getOperand(0).setReg(getSTReg(Op1)); + MI->RemoveOperand(1); + MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode()))); + + // If any of the operands are killed by this instruction, free them. + if (KillsOp0) freeStackSlotAfter(I, Op0); + if (KillsOp1 && Op0 != Op1) freeStackSlotAfter(I, Op1); +} + +/// handleCondMovFP - Handle two address conditional move instructions. These +/// instructions move a st(i) register to st(0) iff a condition is true. These +/// instructions require that the first operand is at the top of the stack, but +/// otherwise don't modify the stack at all. +void FPS::handleCondMovFP(MachineBasicBlock::iterator &I) { + MachineInstr *MI = I; + + unsigned Op0 = getFPReg(MI->getOperand(0)); + unsigned Op1 = getFPReg(MI->getOperand(2)); + bool KillsOp1 = MI->killsRegister(X86::FP0+Op1); + + // The first operand *must* be on the top of the stack. + moveToTop(Op0, I); + + // Change the second operand to the stack register that the operand is in. + // Change from the pseudo instruction to the concrete instruction. + MI->RemoveOperand(0); + MI->RemoveOperand(1); + MI->getOperand(0).setReg(getSTReg(Op1)); + MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode()))); + + // If we kill the second operand, make sure to pop it from the stack. + if (Op0 != Op1 && KillsOp1) { + // Get this value off of the register stack. + freeStackSlotAfter(I, Op1); + } +} + + +/// handleSpecialFP - Handle special instructions which behave unlike other +/// floating point instructions. This is primarily intended for use by pseudo +/// instructions. +/// +void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) { + MachineInstr *MI = I; + switch (MI->getOpcode()) { + default: llvm_unreachable("Unknown SpecialFP instruction!"); + case TargetOpcode::COPY: { + // We handle three kinds of copies: FP <- FP, FP <- ST, and ST <- FP. + const MachineOperand &MO1 = MI->getOperand(1); + const MachineOperand &MO0 = MI->getOperand(0); + unsigned DstST = MO0.getReg() - X86::ST0; + unsigned SrcST = MO1.getReg() - X86::ST0; + bool KillsSrc = MI->killsRegister(MO1.getReg()); + + // ST = COPY FP. Set up a pending ST register. + if (DstST < 8) { + unsigned SrcFP = getFPReg(MO1); + assert(isLive(SrcFP) && "Cannot copy dead register"); + assert(!MO0.isDead() && "Cannot copy to dead ST register"); + + // Unallocated STs are marked as the nonexistent FP255. 
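+      // (The sentinel stored below is NumFPRegs, i.e. FP16, which is equally
+      // nonexistent; the cleanup loops later compare against it.)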
+ while (NumPendingSTs <= DstST) + PendingST[NumPendingSTs++] = NumFPRegs; + + // STi could still be live from a previous inline asm. + if (isScratchReg(PendingST[DstST])) { + DEBUG(dbgs() << "Clobbering old ST in FP" << unsigned(PendingST[DstST]) + << '\n'); + freeStackSlotBefore(MI, PendingST[DstST]); + } + + // When the source is killed, allocate a scratch FP register. + if (KillsSrc) { + duplicatePendingSTBeforeKill(SrcFP, I); + unsigned Slot = getSlot(SrcFP); + unsigned SR = getScratchReg(); + PendingST[DstST] = SR; + Stack[Slot] = SR; + RegMap[SR] = Slot; + } else + PendingST[DstST] = SrcFP; + break; + } + + // FP = COPY ST. Extract fixed stack value. + // Any instruction defining ST registers must have assigned them to a + // scratch register. + if (SrcST < 8) { + unsigned DstFP = getFPReg(MO0); + assert(!isLive(DstFP) && "Cannot copy ST to live FP register"); + assert(NumPendingSTs > SrcST && "Cannot copy from dead ST register"); + unsigned SrcFP = PendingST[SrcST]; + assert(isScratchReg(SrcFP) && "Expected ST in a scratch register"); + assert(isLive(SrcFP) && "Scratch holding ST is dead"); + + // DstFP steals the stack slot from SrcFP. + unsigned Slot = getSlot(SrcFP); + Stack[Slot] = DstFP; + RegMap[DstFP] = Slot; + + // Always treat the ST as killed. + PendingST[SrcST] = NumFPRegs; + while (NumPendingSTs && PendingST[NumPendingSTs - 1] == NumFPRegs) + --NumPendingSTs; + break; + } + + // FP <- FP copy. + unsigned DstFP = getFPReg(MO0); + unsigned SrcFP = getFPReg(MO1); + assert(isLive(SrcFP) && "Cannot copy dead register"); + if (KillsSrc) { + // If the input operand is killed, we can just change the owner of the + // incoming stack slot into the result. + unsigned Slot = getSlot(SrcFP); + Stack[Slot] = DstFP; + RegMap[DstFP] = Slot; + } else { + // For COPY we just duplicate the specified value to a new stack slot. + // This could be made better, but would require substantial changes. + duplicateToTop(SrcFP, DstFP, I); + } + break; + } + + case TargetOpcode::IMPLICIT_DEF: { + // All FP registers must be explicitly defined, so load a 0 instead. + unsigned Reg = MI->getOperand(0).getReg() - X86::FP0; + DEBUG(dbgs() << "Emitting LD_F0 for implicit FP" << Reg << '\n'); + BuildMI(*MBB, I, MI->getDebugLoc(), TII->get(X86::LD_F0)); + pushReg(Reg); + break; + } + + case X86::FpPOP_RETVAL: { + // The FpPOP_RETVAL instruction is used after calls that return a value on + // the floating point stack. We cannot model this with ST defs since CALL + // instructions have fixed clobber lists. This instruction is interpreted + // to mean that there is one more live register on the stack than we + // thought. + // + // This means that StackTop does not match the hardware stack between a + // call and the FpPOP_RETVAL instructions. We do tolerate FP instructions + // between CALL and FpPOP_RETVAL as long as they don't overflow the + // hardware stack. + unsigned DstFP = getFPReg(MI->getOperand(0)); + + // Move existing stack elements up to reflect reality. + assert(StackTop < 8 && "Stack overflowed before FpPOP_RETVAL"); + if (StackTop) { + std::copy_backward(Stack, Stack + StackTop, Stack + StackTop + 1); + for (unsigned i = 0; i != NumFPRegs; ++i) + ++RegMap[i]; + } + ++StackTop; + + // DstFP is the new bottom of the stack. + Stack[0] = DstFP; + RegMap[DstFP] = 0; + + // DstFP will be killed by processBasicBlock if this was a dead def. + break; + } + + case TargetOpcode::INLINEASM: { + // The inline asm MachineInstr currently only *uses* FP registers for the + // 'f' constraint. 
These should be turned into the current ST(x) register + // in the machine instr. + // + // There are special rules for x87 inline assembly. The compiler must know + // exactly how many registers are popped and pushed implicitly by the asm. + // Otherwise it is not possible to restore the stack state after the inline + // asm. + // + // There are 3 kinds of input operands: + // + // 1. Popped inputs. These must appear at the stack top in ST0-STn. A + // popped input operand must be in a fixed stack slot, and it is either + // tied to an output operand, or in the clobber list. The MI has ST use + // and def operands for these inputs. + // + // 2. Fixed inputs. These inputs appear in fixed stack slots, but are + // preserved by the inline asm. The fixed stack slots must be STn-STm + // following the popped inputs. A fixed input operand cannot be tied to + // an output or appear in the clobber list. The MI has ST use operands + // and no defs for these inputs. + // + // 3. Preserved inputs. These inputs use the "f" constraint which is + // represented as an FP register. The inline asm won't change these + // stack slots. + // + // Outputs must be in ST registers, FP outputs are not allowed. Clobbered + // registers do not count as output operands. The inline asm changes the + // stack as if it popped all the popped inputs and then pushed all the + // output operands. + + // Scan the assembly for ST registers used, defined and clobbered. We can + // only tell clobbers from defs by looking at the asm descriptor. + unsigned STUses = 0, STDefs = 0, STClobbers = 0, STDeadDefs = 0; + unsigned NumOps = 0; + for (unsigned i = InlineAsm::MIOp_FirstOperand, e = MI->getNumOperands(); + i != e && MI->getOperand(i).isImm(); i += 1 + NumOps) { + unsigned Flags = MI->getOperand(i).getImm(); + NumOps = InlineAsm::getNumOperandRegisters(Flags); + if (NumOps != 1) + continue; + const MachineOperand &MO = MI->getOperand(i + 1); + if (!MO.isReg()) + continue; + unsigned STReg = MO.getReg() - X86::ST0; + if (STReg >= 8) + continue; + + switch (InlineAsm::getKind(Flags)) { + case InlineAsm::Kind_RegUse: + STUses |= (1u << STReg); + break; + case InlineAsm::Kind_RegDef: + case InlineAsm::Kind_RegDefEarlyClobber: + STDefs |= (1u << STReg); + if (MO.isDead()) + STDeadDefs |= (1u << STReg); + break; + case InlineAsm::Kind_Clobber: + STClobbers |= (1u << STReg); + break; + default: + break; + } + } + + if (STUses && !isMask_32(STUses)) + MI->emitError("fixed input regs must be last on the x87 stack"); + unsigned NumSTUses = CountTrailingOnes_32(STUses); + + // Defs must be contiguous from the stack top. ST0-STn. + if (STDefs && !isMask_32(STDefs)) { + MI->emitError("output regs must be last on the x87 stack"); + STDefs = NextPowerOf2(STDefs) - 1; + } + unsigned NumSTDefs = CountTrailingOnes_32(STDefs); + + // So must the clobbered stack slots. ST0-STm, m >= n. + if (STClobbers && !isMask_32(STDefs | STClobbers)) + MI->emitError("clobbers must be last on the x87 stack"); + + // Popped inputs are the ones that are also clobbered or defined. + unsigned STPopped = STUses & (STDefs | STClobbers); + if (STPopped && !isMask_32(STPopped)) + MI->emitError("implicitly popped regs must be last on the x87 stack"); + unsigned NumSTPopped = CountTrailingOnes_32(STPopped); + + DEBUG(dbgs() << "Asm uses " << NumSTUses << " fixed regs, pops " + << NumSTPopped << ", and defines " << NumSTDefs << " regs.\n"); + + // Scan the instruction for FP uses corresponding to "f" constraints. + // Collect FP registers to kill afer the instruction. 
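+    // The scratch registers are FP8-FP15, so their bits form the 0xff00 part
+    // of the kill mask computed below.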
+ // Always kill all the scratch regs. + unsigned FPKills = ((1u << NumFPRegs) - 1) & ~0xff; + unsigned FPUsed = 0; + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &Op = MI->getOperand(i); + if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6) + continue; + if (!Op.isUse()) + MI->emitError("illegal \"f\" output constraint"); + unsigned FPReg = getFPReg(Op); + FPUsed |= 1U << FPReg; + + // If we kill this operand, make sure to pop it from the stack after the + // asm. We just remember it for now, and pop them all off at the end in + // a batch. + if (Op.isKill()) + FPKills |= 1U << FPReg; + } + + // The popped inputs will be killed by the instruction, so duplicate them + // if the FP register needs to be live after the instruction, or if it is + // used in the instruction itself. We effectively treat the popped inputs + // as early clobbers. + for (unsigned i = 0; i < NumSTPopped; ++i) { + if ((FPKills & ~FPUsed) & (1u << PendingST[i])) + continue; + unsigned SR = getScratchReg(); + duplicateToTop(PendingST[i], SR, I); + DEBUG(dbgs() << "Duplicating ST" << i << " in FP" + << unsigned(PendingST[i]) << " to avoid clobbering it.\n"); + PendingST[i] = SR; + } + + // Make sure we have a unique live register for every fixed use. Some of + // them could be undef uses, and we need to emit LD_F0 instructions. + for (unsigned i = 0; i < NumSTUses; ++i) { + if (i < NumPendingSTs && PendingST[i] < NumFPRegs) { + // Check for shared assignments. + for (unsigned j = 0; j < i; ++j) { + if (PendingST[j] != PendingST[i]) + continue; + // STi and STj are inn the same register, create a copy. + unsigned SR = getScratchReg(); + duplicateToTop(PendingST[i], SR, I); + DEBUG(dbgs() << "Duplicating ST" << i << " in FP" + << unsigned(PendingST[i]) + << " to avoid collision with ST" << j << '\n'); + PendingST[i] = SR; + } + continue; + } + unsigned SR = getScratchReg(); + DEBUG(dbgs() << "Emitting LD_F0 for ST" << i << " in FP" << SR << '\n'); + BuildMI(*MBB, I, MI->getDebugLoc(), TII->get(X86::LD_F0)); + pushReg(SR); + PendingST[i] = SR; + if (NumPendingSTs == i) + ++NumPendingSTs; + } + assert(NumPendingSTs >= NumSTUses && "Fixed registers should be assigned"); + + // Now we can rearrange the live registers to match what was requested. + shuffleStackTop(PendingST, NumPendingSTs, I); + DEBUG({dbgs() << "Before asm: "; dumpStack();}); + + // With the stack layout fixed, rewrite the FP registers. + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &Op = MI->getOperand(i); + if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6) + continue; + unsigned FPReg = getFPReg(Op); + Op.setReg(getSTReg(FPReg)); + } + + // Simulate the inline asm popping its inputs and pushing its outputs. + StackTop -= NumSTPopped; + + // Hold the fixed output registers in scratch FP registers. They will be + // transferred to real FP registers by copies. + NumPendingSTs = 0; + for (unsigned i = 0; i < NumSTDefs; ++i) { + unsigned SR = getScratchReg(); + pushReg(SR); + FPKills &= ~(1u << SR); + } + for (unsigned i = 0; i < NumSTDefs; ++i) + PendingST[NumPendingSTs++] = getStackEntry(i); + DEBUG({dbgs() << "After asm: "; dumpStack();}); + + // If any of the ST defs were dead, pop them immediately. Our caller only + // handles dead FP defs. 
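+    // E.g. if the asm defines st(0) and st(1) but the st(1) result is never
+    // read, the scratch register holding st(1) is freed right after the asm.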
+ MachineBasicBlock::iterator InsertPt = MI; + for (unsigned i = 0; STDefs & (1u << i); ++i) { + if (!(STDeadDefs & (1u << i))) + continue; + freeStackSlotAfter(InsertPt, PendingST[i]); + PendingST[i] = NumFPRegs; + } + while (NumPendingSTs && PendingST[NumPendingSTs - 1] == NumFPRegs) + --NumPendingSTs; + + // If this asm kills any FP registers (is the last use of them) we must + // explicitly emit pop instructions for them. Do this now after the asm has + // executed so that the ST(x) numbers are not off (which would happen if we + // did this inline with operand rewriting). + // + // Note: this might be a non-optimal pop sequence. We might be able to do + // better by trying to pop in stack order or something. + while (FPKills) { + unsigned FPReg = CountTrailingZeros_32(FPKills); + if (isLive(FPReg)) + freeStackSlotAfter(InsertPt, FPReg); + FPKills &= ~(1U << FPReg); + } + // Don't delete the inline asm! + return; + } + + case X86::RET: + case X86::RETI: + // If RET has an FP register use operand, pass the first one in ST(0) and + // the second one in ST(1). + + // Find the register operands. + unsigned FirstFPRegOp = ~0U, SecondFPRegOp = ~0U; + unsigned LiveMask = 0; + + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &Op = MI->getOperand(i); + if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6) + continue; + // FP Register uses must be kills unless there are two uses of the same + // register, in which case only one will be a kill. + assert(Op.isUse() && + (Op.isKill() || // Marked kill. + getFPReg(Op) == FirstFPRegOp || // Second instance. + MI->killsRegister(Op.getReg())) && // Later use is marked kill. + "Ret only defs operands, and values aren't live beyond it"); + + if (FirstFPRegOp == ~0U) + FirstFPRegOp = getFPReg(Op); + else { + assert(SecondFPRegOp == ~0U && "More than two fp operands!"); + SecondFPRegOp = getFPReg(Op); + } + LiveMask |= (1 << getFPReg(Op)); + + // Remove the operand so that later passes don't see it. + MI->RemoveOperand(i); + --i, --e; + } + + // We may have been carrying spurious live-ins, so make sure only the returned + // registers are left live. + adjustLiveRegs(LiveMask, MI); + if (!LiveMask) return; // Quick check to see if any are possible. + + // There are only four possibilities here: + // 1) we are returning a single FP value. In this case, it has to be in + // ST(0) already, so just declare success by removing the value from the + // FP Stack. + if (SecondFPRegOp == ~0U) { + // Assert that the top of stack contains the right FP register. + assert(StackTop == 1 && FirstFPRegOp == getStackEntry(0) && + "Top of stack not the right register for RET!"); + + // Ok, everything is good, mark the value as not being on the stack + // anymore so that our assertion about the stack being empty at end of + // block doesn't fire. + StackTop = 0; + return; + } + + // Otherwise, we are returning two values: + // 2) If returning the same value for both, we only have one thing in the FP + // stack. Consider: RET FP1, FP1 + if (StackTop == 1) { + assert(FirstFPRegOp == SecondFPRegOp && FirstFPRegOp == getStackEntry(0)&& + "Stack misconfiguration for RET!"); + + // Duplicate the TOS so that we return it twice. Just pick some other FPx + // register to hold it. 
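+      // E.g. RET %FP0, %FP0 with a single live value: an fld of st(0) leaves
+      // a second copy in a scratch register so the value can be returned in
+      // both st(0) and st(1).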
+ unsigned NewReg = getScratchReg(); + duplicateToTop(FirstFPRegOp, NewReg, MI); + FirstFPRegOp = NewReg; + } + + /// Okay we know we have two different FPx operands now: + assert(StackTop == 2 && "Must have two values live!"); + + /// 3) If SecondFPRegOp is currently in ST(0) and FirstFPRegOp is currently + /// in ST(1). In this case, emit an fxch. + if (getStackEntry(0) == SecondFPRegOp) { + assert(getStackEntry(1) == FirstFPRegOp && "Unknown regs live"); + moveToTop(FirstFPRegOp, MI); + } + + /// 4) Finally, FirstFPRegOp must be in ST(0) and SecondFPRegOp must be in + /// ST(1). Just remove both from our understanding of the stack and return. + assert(getStackEntry(0) == FirstFPRegOp && "Unknown regs live"); + assert(getStackEntry(1) == SecondFPRegOp && "Unknown regs live"); + StackTop = 0; + return; + } + + I = MBB->erase(I); // Remove the pseudo instruction + + // We want to leave I pointing to the previous instruction, but what if we + // just erased the first instruction? + if (I == MBB->begin()) { + DEBUG(dbgs() << "Inserting dummy KILL\n"); + I = BuildMI(*MBB, I, DebugLoc(), TII->get(TargetOpcode::KILL)); + } else + --I; +} diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp new file mode 100644 index 0000000..d54f4ae --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -0,0 +1,1447 @@ +//=======- X86FrameLowering.cpp - X86 Frame Information --------*- C++ -*-====// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the X86 implementation of TargetFrameLowering class. +// +//===----------------------------------------------------------------------===// + +#include "X86FrameLowering.h" +#include "X86InstrBuilder.h" +#include "X86InstrInfo.h" +#include "X86MachineFunctionInfo.h" +#include "X86Subtarget.h" +#include "X86TargetMachine.h" +#include "llvm/Function.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/ADT/SmallSet.h" + +using namespace llvm; + +// FIXME: completely move here. +extern cl::opt<bool> ForceStackAlign; + +bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { + return !MF.getFrameInfo()->hasVarSizedObjects(); +} + +/// hasFP - Return true if the specified function should have a dedicated frame +/// pointer register. This is true if the function has variable sized allocas +/// or if frame pointer elimination is disabled. 
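+/// A frame pointer is also required when the stack must be realigned, when
+/// the frame address is taken, and for functions that call
+/// __builtin_unwind_init.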
+bool X86FrameLowering::hasFP(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const MachineModuleInfo &MMI = MF.getMMI(); + const TargetRegisterInfo *RI = TM.getRegisterInfo(); + + return (DisableFramePointerElim(MF) || + RI->needsStackRealignment(MF) || + MFI->hasVarSizedObjects() || + MFI->isFrameAddressTaken() || + MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() || + MMI.callsUnwindInit()); +} + +static unsigned getSUBriOpcode(unsigned is64Bit, int64_t Imm) { + if (is64Bit) { + if (isInt<8>(Imm)) + return X86::SUB64ri8; + return X86::SUB64ri32; + } else { + if (isInt<8>(Imm)) + return X86::SUB32ri8; + return X86::SUB32ri; + } +} + +static unsigned getADDriOpcode(unsigned is64Bit, int64_t Imm) { + if (is64Bit) { + if (isInt<8>(Imm)) + return X86::ADD64ri8; + return X86::ADD64ri32; + } else { + if (isInt<8>(Imm)) + return X86::ADD32ri8; + return X86::ADD32ri; + } +} + +/// findDeadCallerSavedReg - Return a caller-saved register that isn't live +/// when it reaches the "return" instruction. We can then pop a stack object +/// to this register without worry about clobbering it. +static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + const TargetRegisterInfo &TRI, + bool Is64Bit) { + const MachineFunction *MF = MBB.getParent(); + const Function *F = MF->getFunction(); + if (!F || MF->getMMI().callsEHReturn()) + return 0; + + static const unsigned CallerSavedRegs32Bit[] = { + X86::EAX, X86::EDX, X86::ECX, 0 + }; + + static const unsigned CallerSavedRegs64Bit[] = { + X86::RAX, X86::RDX, X86::RCX, X86::RSI, X86::RDI, + X86::R8, X86::R9, X86::R10, X86::R11, 0 + }; + + unsigned Opc = MBBI->getOpcode(); + switch (Opc) { + default: return 0; + case X86::RET: + case X86::RETI: + case X86::TCRETURNdi: + case X86::TCRETURNri: + case X86::TCRETURNmi: + case X86::TCRETURNdi64: + case X86::TCRETURNri64: + case X86::TCRETURNmi64: + case X86::EH_RETURN: + case X86::EH_RETURN64: { + SmallSet<unsigned, 8> Uses; + for (unsigned i = 0, e = MBBI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MBBI->getOperand(i); + if (!MO.isReg() || MO.isDef()) + continue; + unsigned Reg = MO.getReg(); + if (!Reg) + continue; + for (const unsigned *AsI = TRI.getOverlaps(Reg); *AsI; ++AsI) + Uses.insert(*AsI); + } + + const unsigned *CS = Is64Bit ? CallerSavedRegs64Bit : CallerSavedRegs32Bit; + for (; *CS; ++CS) + if (!Uses.count(*CS)) + return *CS; + } + } + + return 0; +} + + +/// emitSPUpdate - Emit a series of instructions to increment / decrement the +/// stack pointer by a constant value. +static +void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, + unsigned StackPtr, int64_t NumBytes, + bool Is64Bit, const TargetInstrInfo &TII, + const TargetRegisterInfo &TRI) { + bool isSub = NumBytes < 0; + uint64_t Offset = isSub ? -NumBytes : NumBytes; + unsigned Opc = isSub ? + getSUBriOpcode(Is64Bit, Offset) : + getADDriOpcode(Is64Bit, Offset); + uint64_t Chunk = (1LL << 31) - 1; + DebugLoc DL = MBB.findDebugLoc(MBBI); + + while (Offset) { + uint64_t ThisVal = (Offset > Chunk) ? Chunk : Offset; + if (ThisVal == (Is64Bit ? 8 : 4)) { + // Use push / pop instead. + unsigned Reg = isSub + ? (unsigned)(Is64Bit ? X86::RAX : X86::EAX) + : findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit); + if (Reg) { + Opc = isSub + ? (Is64Bit ? X86::PUSH64r : X86::PUSH32r) + : (Is64Bit ? 
X86::POP64r : X86::POP32r); + MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc)) + .addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub)); + if (isSub) + MI->setFlag(MachineInstr::FrameSetup); + Offset -= ThisVal; + continue; + } + } + + MachineInstr *MI = + BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) + .addReg(StackPtr) + .addImm(ThisVal); + if (isSub) + MI->setFlag(MachineInstr::FrameSetup); + MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. + Offset -= ThisVal; + } +} + +/// mergeSPUpdatesUp - Merge two stack-manipulating instructions upper iterator. +static +void mergeSPUpdatesUp(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, + unsigned StackPtr, uint64_t *NumBytes = NULL) { + if (MBBI == MBB.begin()) return; + + MachineBasicBlock::iterator PI = prior(MBBI); + unsigned Opc = PI->getOpcode(); + if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 || + Opc == X86::ADD32ri || Opc == X86::ADD32ri8) && + PI->getOperand(0).getReg() == StackPtr) { + if (NumBytes) + *NumBytes += PI->getOperand(2).getImm(); + MBB.erase(PI); + } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 || + Opc == X86::SUB32ri || Opc == X86::SUB32ri8) && + PI->getOperand(0).getReg() == StackPtr) { + if (NumBytes) + *NumBytes -= PI->getOperand(2).getImm(); + MBB.erase(PI); + } +} + +/// mergeSPUpdatesDown - Merge two stack-manipulating instructions lower iterator. +static +void mergeSPUpdatesDown(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + unsigned StackPtr, uint64_t *NumBytes = NULL) { + // FIXME: THIS ISN'T RUN!!! + return; + + if (MBBI == MBB.end()) return; + + MachineBasicBlock::iterator NI = llvm::next(MBBI); + if (NI == MBB.end()) return; + + unsigned Opc = NI->getOpcode(); + if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 || + Opc == X86::ADD32ri || Opc == X86::ADD32ri8) && + NI->getOperand(0).getReg() == StackPtr) { + if (NumBytes) + *NumBytes -= NI->getOperand(2).getImm(); + MBB.erase(NI); + MBBI = NI; + } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 || + Opc == X86::SUB32ri || Opc == X86::SUB32ri8) && + NI->getOperand(0).getReg() == StackPtr) { + if (NumBytes) + *NumBytes += NI->getOperand(2).getImm(); + MBB.erase(NI); + MBBI = NI; + } +} + +/// mergeSPUpdates - Checks the instruction before/after the passed +/// instruction. If it is an ADD/SUB instruction it is deleted argument and the +/// stack adjustment is returned as a positive value for ADD and a negative for +/// SUB. +static int mergeSPUpdates(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + unsigned StackPtr, + bool doMergeWithPrevious) { + if ((doMergeWithPrevious && MBBI == MBB.begin()) || + (!doMergeWithPrevious && MBBI == MBB.end())) + return 0; + + MachineBasicBlock::iterator PI = doMergeWithPrevious ? prior(MBBI) : MBBI; + MachineBasicBlock::iterator NI = doMergeWithPrevious ? 
0 : llvm::next(MBBI); + unsigned Opc = PI->getOpcode(); + int Offset = 0; + + if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 || + Opc == X86::ADD32ri || Opc == X86::ADD32ri8) && + PI->getOperand(0).getReg() == StackPtr){ + Offset += PI->getOperand(2).getImm(); + MBB.erase(PI); + if (!doMergeWithPrevious) MBBI = NI; + } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 || + Opc == X86::SUB32ri || Opc == X86::SUB32ri8) && + PI->getOperand(0).getReg() == StackPtr) { + Offset -= PI->getOperand(2).getImm(); + MBB.erase(PI); + if (!doMergeWithPrevious) MBBI = NI; + } + + return Offset; +} + +static bool isEAXLiveIn(MachineFunction &MF) { + for (MachineRegisterInfo::livein_iterator II = MF.getRegInfo().livein_begin(), + EE = MF.getRegInfo().livein_end(); II != EE; ++II) { + unsigned Reg = II->first; + + if (Reg == X86::EAX || Reg == X86::AX || + Reg == X86::AH || Reg == X86::AL) + return true; + } + + return false; +} + +void X86FrameLowering::emitCalleeSavedFrameMoves(MachineFunction &MF, + MCSymbol *Label, + unsigned FramePtr) const { + MachineFrameInfo *MFI = MF.getFrameInfo(); + MachineModuleInfo &MMI = MF.getMMI(); + + // Add callee saved registers to move list. + const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); + if (CSI.empty()) return; + + std::vector<MachineMove> &Moves = MMI.getFrameMoves(); + const TargetData *TD = TM.getTargetData(); + bool HasFP = hasFP(MF); + + // Calculate amount of bytes used for return address storing. + int stackGrowth = -TD->getPointerSize(); + + // FIXME: This is dirty hack. The code itself is pretty mess right now. + // It should be rewritten from scratch and generalized sometimes. + + // Determine maximum offset (minimum due to stack growth). + int64_t MaxOffset = 0; + for (std::vector<CalleeSavedInfo>::const_iterator + I = CSI.begin(), E = CSI.end(); I != E; ++I) + MaxOffset = std::min(MaxOffset, + MFI->getObjectOffset(I->getFrameIdx())); + + // Calculate offsets. + int64_t saveAreaOffset = (HasFP ? 3 : 2) * stackGrowth; + for (std::vector<CalleeSavedInfo>::const_iterator + I = CSI.begin(), E = CSI.end(); I != E; ++I) { + int64_t Offset = MFI->getObjectOffset(I->getFrameIdx()); + unsigned Reg = I->getReg(); + Offset = MaxOffset - Offset + saveAreaOffset; + + // Don't output a new machine move if we're re-saving the frame + // pointer. This happens when the PrologEpilogInserter has inserted an extra + // "PUSH" of the frame pointer -- the "emitPrologue" method automatically + // generates one when frame pointers are used. If we generate a "machine + // move" for this extra "PUSH", the linker will lose track of the fact that + // the frame pointer should have the value of the first "PUSH" when it's + // trying to unwind. + // + // FIXME: This looks inelegant. It's possibly correct, but it's covering up + // another bug. I.e., one where we generate a prolog like this: + // + // pushl %ebp + // movl %esp, %ebp + // pushl %ebp + // pushl %esi + // ... + // + // The immediate re-push of EBP is unnecessary. At the least, it's an + // optimization bug. EBP can be used as a scratch register in certain + // cases, but probably not when we have a frame pointer. + if (HasFP && FramePtr == Reg) + continue; + + MachineLocation CSDst(MachineLocation::VirtualFP, Offset); + MachineLocation CSSrc(Reg); + Moves.push_back(MachineMove(Label, CSDst, CSSrc)); + } +} + +/// getCompactUnwindRegNum - Get the compact unwind number for a given +/// register. The number corresponds to the enum lists in +/// compact_unwind_encoding.h. 
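+/// For illustration, with the 64-bit table used below ({ RBX, R12, R13, R14,
+/// R15, RBP }) this returns 1 for RBX, 3 for R13, and -1 for any register
+/// that is not in the list.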
+static int getCompactUnwindRegNum(const unsigned *CURegs, unsigned Reg) { + int Idx = 1; + for (; *CURegs; ++CURegs, ++Idx) + if (*CURegs == Reg) + return Idx; + + return -1; +} + +/// encodeCompactUnwindRegistersWithoutFrame - Create the permutation encoding +/// used with frameless stacks. It is passed the number of registers to be saved +/// and an array of the registers saved. +static uint32_t encodeCompactUnwindRegistersWithoutFrame(unsigned SavedRegs[6], + unsigned RegCount, + bool Is64Bit) { + // The saved registers are numbered from 1 to 6. In order to encode the order + // in which they were saved, we re-number them according to their place in the + // register order. The re-numbering is relative to the last re-numbered + // register. E.g., if we have registers {6, 2, 4, 5} saved in that order: + // + // Orig Re-Num + // ---- ------ + // 6 6 + // 2 2 + // 4 3 + // 5 3 + // + static const unsigned CU32BitRegs[] = { + X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0 + }; + static const unsigned CU64BitRegs[] = { + X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0 + }; + const unsigned *CURegs = (Is64Bit ? CU64BitRegs : CU32BitRegs); + + uint32_t RenumRegs[6]; + for (unsigned i = 6 - RegCount; i < 6; ++i) { + int CUReg = getCompactUnwindRegNum(CURegs, SavedRegs[i]); + if (CUReg == -1) return ~0U; + SavedRegs[i] = CUReg; + + unsigned Countless = 0; + for (unsigned j = 6 - RegCount; j < i; ++j) + if (SavedRegs[j] < SavedRegs[i]) + ++Countless; + + RenumRegs[i] = SavedRegs[i] - Countless - 1; + } + + // Take the renumbered values and encode them into a 10-bit number. + uint32_t permutationEncoding = 0; + switch (RegCount) { + case 6: + permutationEncoding |= 120 * RenumRegs[0] + 24 * RenumRegs[1] + + 6 * RenumRegs[2] + 2 * RenumRegs[3] + + RenumRegs[4]; + break; + case 5: + permutationEncoding |= 120 * RenumRegs[1] + 24 * RenumRegs[2] + + 6 * RenumRegs[3] + 2 * RenumRegs[4] + + RenumRegs[5]; + break; + case 4: + permutationEncoding |= 60 * RenumRegs[2] + 12 * RenumRegs[3] + + 3 * RenumRegs[4] + RenumRegs[5]; + break; + case 3: + permutationEncoding |= 20 * RenumRegs[3] + 4 * RenumRegs[4] + + RenumRegs[5]; + break; + case 2: + permutationEncoding |= 5 * RenumRegs[4] + RenumRegs[5]; + break; + case 1: + permutationEncoding |= RenumRegs[5]; + break; + } + + assert((permutationEncoding & 0x3FF) == permutationEncoding && + "Invalid compact register encoding!"); + return permutationEncoding; +} + +/// encodeCompactUnwindRegistersWithFrame - Return the registers encoded for a +/// compact encoding with a frame pointer. +static uint32_t encodeCompactUnwindRegistersWithFrame(unsigned SavedRegs[6], + bool Is64Bit) { + static const unsigned CU32BitRegs[] = { + X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0 + }; + static const unsigned CU64BitRegs[] = { + X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0 + }; + const unsigned *CURegs = (Is64Bit ? CU64BitRegs : CU32BitRegs); + + // Encode the registers in the order they were saved, 3-bits per register. The + // registers are numbered from 1 to 6. 
+ uint32_t RegEnc = 0; + for (int I = 5; I >= 0; --I) { + unsigned Reg = SavedRegs[I]; + if (Reg == 0) break; + int CURegNum = getCompactUnwindRegNum(CURegs, Reg); + if (CURegNum == -1) + return ~0U; + RegEnc |= (CURegNum & 0x7) << (5 - I); + } + + assert((RegEnc & 0x7FFF) == RegEnc && "Invalid compact register encoding!"); + return RegEnc; +} + +uint32_t X86FrameLowering::getCompactUnwindEncoding(MachineFunction &MF) const { + const X86RegisterInfo *RegInfo = TM.getRegisterInfo(); + unsigned FramePtr = RegInfo->getFrameRegister(MF); + unsigned StackPtr = RegInfo->getStackRegister(); + + X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); + + bool Is64Bit = STI.is64Bit(); + bool HasFP = hasFP(MF); + + unsigned SavedRegs[6] = { 0, 0, 0, 0, 0, 0 }; + int SavedRegIdx = 6; + + unsigned OffsetSize = (Is64Bit ? 8 : 4); + + unsigned PushInstr = (Is64Bit ? X86::PUSH64r : X86::PUSH32r); + unsigned PushInstrSize = 1; + unsigned MoveInstr = (Is64Bit ? X86::MOV64rr : X86::MOV32rr); + unsigned MoveInstrSize = (Is64Bit ? 3 : 2); + unsigned SubtractInstr = getSUBriOpcode(Is64Bit, -TailCallReturnAddrDelta); + unsigned SubtractInstrIdx = (Is64Bit ? 3 : 2); + + unsigned StackDivide = (Is64Bit ? 8 : 4); + + unsigned InstrOffset = 0; + unsigned CFAOffset = 0; + unsigned StackAdjust = 0; + + MachineBasicBlock &MBB = MF.front(); // Prologue is in entry BB. + bool ExpectEnd = false; + for (MachineBasicBlock::iterator + MBBI = MBB.begin(), MBBE = MBB.end(); MBBI != MBBE; ++MBBI) { + MachineInstr &MI = *MBBI; + unsigned Opc = MI.getOpcode(); + if (Opc == X86::PROLOG_LABEL) continue; + if (!MI.getFlag(MachineInstr::FrameSetup)) break; + + // We don't exect any more prolog instructions. + if (ExpectEnd) return 0; + + if (Opc == PushInstr) { + // If there are too many saved registers, we cannot use compact encoding. + if (--SavedRegIdx < 0) return 0; + + SavedRegs[SavedRegIdx] = MI.getOperand(0).getReg(); + CFAOffset += OffsetSize; + InstrOffset += PushInstrSize; + } else if (Opc == MoveInstr) { + unsigned SrcReg = MI.getOperand(1).getReg(); + unsigned DstReg = MI.getOperand(0).getReg(); + + if (DstReg != FramePtr || SrcReg != StackPtr) + return 0; + + CFAOffset = 0; + memset(SavedRegs, 0, sizeof(SavedRegs)); + InstrOffset += MoveInstrSize; + } else if (Opc == SubtractInstr) { + if (StackAdjust) + // We all ready have a stack pointer adjustment. + return 0; + + if (!MI.getOperand(0).isReg() || + MI.getOperand(0).getReg() != MI.getOperand(1).getReg() || + MI.getOperand(0).getReg() != StackPtr || !MI.getOperand(2).isImm()) + // We need this to be a stack adjustment pointer. Something like: + // + // %RSP<def> = SUB64ri8 %RSP, 48 + return 0; + + StackAdjust = MI.getOperand(2).getImm() / StackDivide; + SubtractInstrIdx += InstrOffset; + ExpectEnd = true; + } + } + + // Encode that we are using EBP/RBP as the frame pointer. + uint32_t CompactUnwindEncoding = 0; + CFAOffset /= StackDivide; + if (HasFP) { + if ((CFAOffset & 0xFF) != CFAOffset) + // Offset was too big for compact encoding. + return 0; + + // Get the encoding of the saved registers when we have a frame pointer. + uint32_t RegEnc = encodeCompactUnwindRegistersWithFrame(SavedRegs, Is64Bit); + if (RegEnc == ~0U) + return 0; + + CompactUnwindEncoding |= 0x01000000; + CompactUnwindEncoding |= (CFAOffset & 0xFF) << 16; + CompactUnwindEncoding |= RegEnc & 0x7FFF; + } else { + unsigned FullOffset = CFAOffset + StackAdjust; + if ((FullOffset & 0xFF) == FullOffset) { + // Frameless stack. 
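+      // Layout produced below: the 0x02 "frameless" tag in the top byte, the
+      // stack size (already scaled down by the slot size) in bits 16-23, and
+      // the saved-register permutation in the low 10 bits.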
+ CompactUnwindEncoding |= 0x02000000; + CompactUnwindEncoding |= (FullOffset & 0xFF) << 16; + } else { + if ((CFAOffset & 0x7) != CFAOffset) + // The extra stack adjustments are too big for us to handle. + return 0; + + // Frameless stack with an offset too large for us to encode compactly. + CompactUnwindEncoding |= 0x03000000; + + // Encode the offset to the nnnnnn value in the 'subl $nnnnnn, ESP' + // instruction. + CompactUnwindEncoding |= (SubtractInstrIdx & 0xFF) << 16; + + // Encode any extra stack stack changes (done via push instructions). + CompactUnwindEncoding |= (CFAOffset & 0x7) << 13; + } + + // Get the encoding of the saved registers when we don't have a frame + // pointer. + uint32_t RegEnc = encodeCompactUnwindRegistersWithoutFrame(SavedRegs, + 6 - SavedRegIdx, + Is64Bit); + if (RegEnc == ~0U) return 0; + CompactUnwindEncoding |= RegEnc & 0x3FF; + } + + return CompactUnwindEncoding; +} + +/// emitPrologue - Push callee-saved registers onto the stack, which +/// automatically adjust the stack pointer. Adjust the stack pointer to allocate +/// space for local variables. Also emit labels used by the exception handler to +/// generate the exception handling frames. +void X86FrameLowering::emitPrologue(MachineFunction &MF) const { + MachineBasicBlock &MBB = MF.front(); // Prologue goes in entry BB. + MachineBasicBlock::iterator MBBI = MBB.begin(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + const Function *Fn = MF.getFunction(); + const X86RegisterInfo *RegInfo = TM.getRegisterInfo(); + const X86InstrInfo &TII = *TM.getInstrInfo(); + MachineModuleInfo &MMI = MF.getMMI(); + X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + bool needsFrameMoves = MMI.hasDebugInfo() || + Fn->needsUnwindTableEntry(); + uint64_t MaxAlign = MFI->getMaxAlignment(); // Desired stack alignment. + uint64_t StackSize = MFI->getStackSize(); // Number of bytes to allocate. + bool HasFP = hasFP(MF); + bool Is64Bit = STI.is64Bit(); + bool IsWin64 = STI.isTargetWin64(); + unsigned StackAlign = getStackAlignment(); + unsigned SlotSize = RegInfo->getSlotSize(); + unsigned FramePtr = RegInfo->getFrameRegister(MF); + unsigned StackPtr = RegInfo->getStackRegister(); + DebugLoc DL; + + // If we're forcing a stack realignment we can't rely on just the frame + // info, we need to know the ABI stack alignment as well in case we + // have a call out. Otherwise just make sure we have some alignment - we'll + // go with the minimum SlotSize. + if (ForceStackAlign) { + if (MFI->hasCalls()) + MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign; + else if (MaxAlign < SlotSize) + MaxAlign = SlotSize; + } + + // Add RETADDR move area to callee saved frame size. + int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); + if (TailCallReturnAddrDelta < 0) + X86FI->setCalleeSavedFrameSize( + X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta); + + // If this is x86-64 and the Red Zone is not disabled, if we are a leaf + // function, and use up to 128 bytes of stack space, don't have a frame + // pointer, calls, or dynamic alloca then we do not need to adjust the + // stack pointer (we fit in the Red Zone). + if (Is64Bit && !Fn->hasFnAttr(Attribute::NoRedZone) && + !RegInfo->needsStackRealignment(MF) && + !MFI->hasVarSizedObjects() && // No dynamic alloca. + !MFI->adjustsStack() && // No calls. 
+ !IsWin64 && // Win64 has no Red Zone + !EnableSegmentedStacks) { // Regular stack + uint64_t MinSize = X86FI->getCalleeSavedFrameSize(); + if (HasFP) MinSize += SlotSize; + StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0); + MFI->setStackSize(StackSize); + } + + // Insert stack pointer adjustment for later moving of return addr. Only + // applies to tail call optimized functions where the callee argument stack + // size is bigger than the callers. + if (TailCallReturnAddrDelta < 0) { + MachineInstr *MI = + BuildMI(MBB, MBBI, DL, + TII.get(getSUBriOpcode(Is64Bit, -TailCallReturnAddrDelta)), + StackPtr) + .addReg(StackPtr) + .addImm(-TailCallReturnAddrDelta) + .setMIFlag(MachineInstr::FrameSetup); + MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. + } + + // Mapping for machine moves: + // + // DST: VirtualFP AND + // SRC: VirtualFP => DW_CFA_def_cfa_offset + // ELSE => DW_CFA_def_cfa + // + // SRC: VirtualFP AND + // DST: Register => DW_CFA_def_cfa_register + // + // ELSE + // OFFSET < 0 => DW_CFA_offset_extended_sf + // REG < 64 => DW_CFA_offset + Reg + // ELSE => DW_CFA_offset_extended + + std::vector<MachineMove> &Moves = MMI.getFrameMoves(); + const TargetData *TD = MF.getTarget().getTargetData(); + uint64_t NumBytes = 0; + int stackGrowth = -TD->getPointerSize(); + + if (HasFP) { + // Calculate required stack adjustment. + uint64_t FrameSize = StackSize - SlotSize; + if (RegInfo->needsStackRealignment(MF)) + FrameSize = (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign; + + NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize(); + + // Get the offset of the stack slot for the EBP register, which is + // guaranteed to be the last slot by processFunctionBeforeFrameFinalized. + // Update the frame offset adjustment. + MFI->setOffsetAdjustment(-NumBytes); + + // Save EBP/RBP into the appropriate stack slot. + BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r)) + .addReg(FramePtr, RegState::Kill) + .setMIFlag(MachineInstr::FrameSetup); + + if (needsFrameMoves) { + // Mark the place where EBP/RBP was saved. + MCSymbol *FrameLabel = MMI.getContext().CreateTempSymbol(); + BuildMI(MBB, MBBI, DL, TII.get(X86::PROLOG_LABEL)) + .addSym(FrameLabel); + + // Define the current CFA rule to use the provided offset. + if (StackSize) { + MachineLocation SPDst(MachineLocation::VirtualFP); + MachineLocation SPSrc(MachineLocation::VirtualFP, 2 * stackGrowth); + Moves.push_back(MachineMove(FrameLabel, SPDst, SPSrc)); + } else { + MachineLocation SPDst(StackPtr); + MachineLocation SPSrc(StackPtr, stackGrowth); + Moves.push_back(MachineMove(FrameLabel, SPDst, SPSrc)); + } + + // Change the rule for the FramePtr to be an "offset" rule. + MachineLocation FPDst(MachineLocation::VirtualFP, 2 * stackGrowth); + MachineLocation FPSrc(FramePtr); + Moves.push_back(MachineMove(FrameLabel, FPDst, FPSrc)); + } + + // Update EBP with the new base value. + BuildMI(MBB, MBBI, DL, + TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), FramePtr) + .addReg(StackPtr) + .setMIFlag(MachineInstr::FrameSetup); + + if (needsFrameMoves) { + // Mark effective beginning of when frame pointer becomes valid. + MCSymbol *FrameLabel = MMI.getContext().CreateTempSymbol(); + BuildMI(MBB, MBBI, DL, TII.get(X86::PROLOG_LABEL)) + .addSym(FrameLabel); + + // Define the current CFA to use the EBP/RBP register. 
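+      // Per the mapping table above (SRC: VirtualFP, DST: Register), this
+      // becomes a DW_CFA_def_cfa_register rule in the emitted frame moves.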
+ MachineLocation FPDst(FramePtr); + MachineLocation FPSrc(MachineLocation::VirtualFP); + Moves.push_back(MachineMove(FrameLabel, FPDst, FPSrc)); + } + + // Mark the FramePtr as live-in in every block except the entry. + for (MachineFunction::iterator I = llvm::next(MF.begin()), E = MF.end(); + I != E; ++I) + I->addLiveIn(FramePtr); + + // Realign stack + if (RegInfo->needsStackRealignment(MF)) { + MachineInstr *MI = + BuildMI(MBB, MBBI, DL, + TII.get(Is64Bit ? X86::AND64ri32 : X86::AND32ri), StackPtr) + .addReg(StackPtr) + .addImm(-MaxAlign) + .setMIFlag(MachineInstr::FrameSetup); + + // The EFLAGS implicit def is dead. + MI->getOperand(3).setIsDead(); + } + } else { + NumBytes = StackSize - X86FI->getCalleeSavedFrameSize(); + } + + // Skip the callee-saved push instructions. + bool PushedRegs = false; + int StackOffset = 2 * stackGrowth; + + while (MBBI != MBB.end() && + (MBBI->getOpcode() == X86::PUSH32r || + MBBI->getOpcode() == X86::PUSH64r)) { + PushedRegs = true; + MBBI->setFlag(MachineInstr::FrameSetup); + ++MBBI; + + if (!HasFP && needsFrameMoves) { + // Mark callee-saved push instruction. + MCSymbol *Label = MMI.getContext().CreateTempSymbol(); + BuildMI(MBB, MBBI, DL, TII.get(X86::PROLOG_LABEL)).addSym(Label); + + // Define the current CFA rule to use the provided offset. + unsigned Ptr = StackSize ? MachineLocation::VirtualFP : StackPtr; + MachineLocation SPDst(Ptr); + MachineLocation SPSrc(Ptr, StackOffset); + Moves.push_back(MachineMove(Label, SPDst, SPSrc)); + StackOffset += stackGrowth; + } + } + + DL = MBB.findDebugLoc(MBBI); + + // If there is an SUB32ri of ESP immediately before this instruction, merge + // the two. This can be the case when tail call elimination is enabled and + // the callee has more arguments then the caller. + NumBytes -= mergeSPUpdates(MBB, MBBI, StackPtr, true); + + // If there is an ADD32ri or SUB32ri of ESP immediately after this + // instruction, merge the two instructions. + mergeSPUpdatesDown(MBB, MBBI, StackPtr, &NumBytes); + + // Adjust stack pointer: ESP -= numbytes. + + // Windows and cygwin/mingw require a prologue helper routine when allocating + // more than 4K bytes on the stack. Windows uses __chkstk and cygwin/mingw + // uses __alloca. __alloca and the 32-bit version of __chkstk will probe the + // stack and adjust the stack pointer in one go. The 64-bit version of + // __chkstk is only responsible for probing the stack. The 64-bit prologue is + // responsible for adjusting the stack pointer. Touching the stack at 4K + // increments is necessary to ensure that the guard pages used by the OS + // virtual memory manager are allocated in correct sequence. + if (NumBytes >= 4096 && STI.isTargetCOFF() && !STI.isTargetEnvMacho()) { + const char *StackProbeSymbol; + bool isSPUpdateNeeded = false; + + if (Is64Bit) { + if (STI.isTargetCygMing()) + StackProbeSymbol = "___chkstk"; + else { + StackProbeSymbol = "__chkstk"; + isSPUpdateNeeded = true; + } + } else if (STI.isTargetCygMing()) + StackProbeSymbol = "_alloca"; + else + StackProbeSymbol = "_chkstk"; + + // Check whether EAX is livein for this function. + bool isEAXAlive = isEAXLiveIn(MF); + + if (isEAXAlive) { + // Sanity check that EAX is not livein for this function. + // It should not be, so throw an assert. 
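+      // (That is: EAX *is* live-in here; the 32-bit code below works around
+      // it by saving and restoring EAX, while the 64-bit path has no such
+      // workaround, hence the assert.)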
+ assert(!Is64Bit && "EAX is livein in x64 case!"); + + // Save EAX + BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r)) + .addReg(X86::EAX, RegState::Kill) + .setMIFlag(MachineInstr::FrameSetup); + } + + if (Is64Bit) { + // Handle the 64-bit Windows ABI case where we need to call __chkstk. + // Function prologue is responsible for adjusting the stack pointer. + BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::RAX) + .addImm(NumBytes) + .setMIFlag(MachineInstr::FrameSetup); + } else { + // Allocate NumBytes-4 bytes on stack in case of isEAXAlive. + // We'll also use 4 already allocated bytes for EAX. + BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX) + .addImm(isEAXAlive ? NumBytes - 4 : NumBytes) + .setMIFlag(MachineInstr::FrameSetup); + } + + BuildMI(MBB, MBBI, DL, + TII.get(Is64Bit ? X86::W64ALLOCA : X86::CALLpcrel32)) + .addExternalSymbol(StackProbeSymbol) + .addReg(StackPtr, RegState::Define | RegState::Implicit) + .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit) + .setMIFlag(MachineInstr::FrameSetup); + + // MSVC x64's __chkstk needs to adjust %rsp. + // FIXME: %rax preserves the offset and should be available. + if (isSPUpdateNeeded) + emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, + TII, *RegInfo); + + if (isEAXAlive) { + // Restore EAX + MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), + X86::EAX), + StackPtr, false, NumBytes - 4); + MI->setFlag(MachineInstr::FrameSetup); + MBB.insert(MBBI, MI); + } + } else if (NumBytes) + emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, + TII, *RegInfo); + + if (( (!HasFP && NumBytes) || PushedRegs) && needsFrameMoves) { + // Mark end of stack pointer adjustment. + MCSymbol *Label = MMI.getContext().CreateTempSymbol(); + BuildMI(MBB, MBBI, DL, TII.get(X86::PROLOG_LABEL)) + .addSym(Label); + + if (!HasFP && NumBytes) { + // Define the current CFA rule to use the provided offset. + if (StackSize) { + MachineLocation SPDst(MachineLocation::VirtualFP); + MachineLocation SPSrc(MachineLocation::VirtualFP, + -StackSize + stackGrowth); + Moves.push_back(MachineMove(Label, SPDst, SPSrc)); + } else { + MachineLocation SPDst(StackPtr); + MachineLocation SPSrc(StackPtr, stackGrowth); + Moves.push_back(MachineMove(Label, SPDst, SPSrc)); + } + } + + // Emit DWARF info specifying the offsets of the callee-saved registers. + if (PushedRegs) + emitCalleeSavedFrameMoves(MF, Label, HasFP ? FramePtr : StackPtr); + } + + // Darwin 10.7 and greater has support for compact unwind encoding. 
+ if (STI.getTargetTriple().isMacOSX() && + !STI.getTargetTriple().isMacOSXVersionLT(10, 7)) + MMI.setCompactUnwindEncoding(getCompactUnwindEncoding(MF)); +} + +void X86FrameLowering::emitEpilogue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + const X86RegisterInfo *RegInfo = TM.getRegisterInfo(); + const X86InstrInfo &TII = *TM.getInstrInfo(); + MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); + assert(MBBI != MBB.end() && "Returning block has no instructions"); + unsigned RetOpcode = MBBI->getOpcode(); + DebugLoc DL = MBBI->getDebugLoc(); + bool Is64Bit = STI.is64Bit(); + unsigned StackAlign = getStackAlignment(); + unsigned SlotSize = RegInfo->getSlotSize(); + unsigned FramePtr = RegInfo->getFrameRegister(MF); + unsigned StackPtr = RegInfo->getStackRegister(); + + switch (RetOpcode) { + default: + llvm_unreachable("Can only insert epilog into returning blocks"); + case X86::RET: + case X86::RETI: + case X86::TCRETURNdi: + case X86::TCRETURNri: + case X86::TCRETURNmi: + case X86::TCRETURNdi64: + case X86::TCRETURNri64: + case X86::TCRETURNmi64: + case X86::EH_RETURN: + case X86::EH_RETURN64: + break; // These are ok + } + + // Get the number of bytes to allocate from the FrameInfo. + uint64_t StackSize = MFI->getStackSize(); + uint64_t MaxAlign = MFI->getMaxAlignment(); + unsigned CSSize = X86FI->getCalleeSavedFrameSize(); + uint64_t NumBytes = 0; + + // If we're forcing a stack realignment we can't rely on just the frame + // info, we need to know the ABI stack alignment as well in case we + // have a call out. Otherwise just make sure we have some alignment - we'll + // go with the minimum. + if (ForceStackAlign) { + if (MFI->hasCalls()) + MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign; + else + MaxAlign = MaxAlign ? MaxAlign : 4; + } + + if (hasFP(MF)) { + // Calculate required stack adjustment. + uint64_t FrameSize = StackSize - SlotSize; + if (RegInfo->needsStackRealignment(MF)) + FrameSize = (FrameSize + MaxAlign - 1)/MaxAlign*MaxAlign; + + NumBytes = FrameSize - CSSize; + + // Pop EBP. + BuildMI(MBB, MBBI, DL, + TII.get(Is64Bit ? X86::POP64r : X86::POP32r), FramePtr); + } else { + NumBytes = StackSize - CSSize; + } + + // Skip the callee-saved pop instructions. + MachineBasicBlock::iterator LastCSPop = MBBI; + while (MBBI != MBB.begin()) { + MachineBasicBlock::iterator PI = prior(MBBI); + unsigned Opc = PI->getOpcode(); + + if (Opc != X86::POP32r && Opc != X86::POP64r && Opc != X86::DBG_VALUE && + !PI->getDesc().isTerminator()) + break; + + --MBBI; + } + + DL = MBBI->getDebugLoc(); + + // If there is an ADD32ri or SUB32ri of ESP immediately before this + // instruction, merge the two instructions. + if (NumBytes || MFI->hasVarSizedObjects()) + mergeSPUpdatesUp(MBB, MBBI, StackPtr, &NumBytes); + + // If dynamic alloca is used, then reset esp to point to the last callee-saved + // slot before popping them off! Same applies for the case, when stack was + // realigned. + if (RegInfo->needsStackRealignment(MF)) { + // We cannot use LEA here, because stack pointer was realigned. We need to + // deallocate local frame back. + if (CSSize) { + emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, TII, *RegInfo); + MBBI = prior(LastCSPop); + } + + BuildMI(MBB, MBBI, DL, + TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), + StackPtr).addReg(FramePtr); + } else if (MFI->hasVarSizedObjects()) { + if (CSSize) { + unsigned Opc = Is64Bit ? 
X86::LEA64r : X86::LEA32r; + MachineInstr *MI = + addRegOffset(BuildMI(MF, DL, TII.get(Opc), StackPtr), + FramePtr, false, -CSSize); + MBB.insert(MBBI, MI); + } else { + BuildMI(MBB, MBBI, DL, + TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), StackPtr) + .addReg(FramePtr); + } + } else if (NumBytes) { + // Adjust stack pointer back: ESP += numbytes. + emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, TII, *RegInfo); + } + + // We're returning from function via eh_return. + if (RetOpcode == X86::EH_RETURN || RetOpcode == X86::EH_RETURN64) { + MBBI = MBB.getLastNonDebugInstr(); + MachineOperand &DestAddr = MBBI->getOperand(0); + assert(DestAddr.isReg() && "Offset should be in register!"); + BuildMI(MBB, MBBI, DL, + TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), + StackPtr).addReg(DestAddr.getReg()); + } else if (RetOpcode == X86::TCRETURNri || RetOpcode == X86::TCRETURNdi || + RetOpcode == X86::TCRETURNmi || + RetOpcode == X86::TCRETURNri64 || RetOpcode == X86::TCRETURNdi64 || + RetOpcode == X86::TCRETURNmi64) { + bool isMem = RetOpcode == X86::TCRETURNmi || RetOpcode == X86::TCRETURNmi64; + // Tail call return: adjust the stack pointer and jump to callee. + MBBI = MBB.getLastNonDebugInstr(); + MachineOperand &JumpTarget = MBBI->getOperand(0); + MachineOperand &StackAdjust = MBBI->getOperand(isMem ? 5 : 1); + assert(StackAdjust.isImm() && "Expecting immediate value."); + + // Adjust stack pointer. + int StackAdj = StackAdjust.getImm(); + int MaxTCDelta = X86FI->getTCReturnAddrDelta(); + int Offset = 0; + assert(MaxTCDelta <= 0 && "MaxTCDelta should never be positive"); + + // Incoporate the retaddr area. + Offset = StackAdj-MaxTCDelta; + assert(Offset >= 0 && "Offset should never be negative"); + + if (Offset) { + // Check for possible merge with preceding ADD instruction. + Offset += mergeSPUpdates(MBB, MBBI, StackPtr, true); + emitSPUpdate(MBB, MBBI, StackPtr, Offset, Is64Bit, TII, *RegInfo); + } + + // Jump to label or value in register. + if (RetOpcode == X86::TCRETURNdi || RetOpcode == X86::TCRETURNdi64) { + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, DL, TII.get((RetOpcode == X86::TCRETURNdi) + ? X86::TAILJMPd : X86::TAILJMPd64)); + if (JumpTarget.isGlobal()) + MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(), + JumpTarget.getTargetFlags()); + else { + assert(JumpTarget.isSymbol()); + MIB.addExternalSymbol(JumpTarget.getSymbolName(), + JumpTarget.getTargetFlags()); + } + } else if (RetOpcode == X86::TCRETURNmi || RetOpcode == X86::TCRETURNmi64) { + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, DL, TII.get((RetOpcode == X86::TCRETURNmi) + ? X86::TAILJMPm : X86::TAILJMPm64)); + for (unsigned i = 0; i != 5; ++i) + MIB.addOperand(MBBI->getOperand(i)); + } else if (RetOpcode == X86::TCRETURNri64) { + BuildMI(MBB, MBBI, DL, TII.get(X86::TAILJMPr64)). + addReg(JumpTarget.getReg(), RegState::Kill); + } else { + BuildMI(MBB, MBBI, DL, TII.get(X86::TAILJMPr)). + addReg(JumpTarget.getReg(), RegState::Kill); + } + + MachineInstr *NewMI = prior(MBBI); + for (unsigned i = 2, e = MBBI->getNumOperands(); i != e; ++i) + NewMI->addOperand(MBBI->getOperand(i)); + + // Delete the pseudo instruction TCRETURN. + MBB.erase(MBBI); + } else if ((RetOpcode == X86::RET || RetOpcode == X86::RETI) && + (X86FI->getTCReturnAddrDelta() < 0)) { + // Add the return addr area delta back since we are not tail calling. + int delta = -1*X86FI->getTCReturnAddrDelta(); + MBBI = MBB.getLastNonDebugInstr(); + + // Check for possible merge with preceding ADD instruction. 
+ delta += mergeSPUpdates(MBB, MBBI, StackPtr, true); + emitSPUpdate(MBB, MBBI, StackPtr, delta, Is64Bit, TII, *RegInfo); + } +} + +int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, int FI) const { + const X86RegisterInfo *RI = + static_cast<const X86RegisterInfo*>(MF.getTarget().getRegisterInfo()); + const MachineFrameInfo *MFI = MF.getFrameInfo(); + int Offset = MFI->getObjectOffset(FI) - getOffsetOfLocalArea(); + uint64_t StackSize = MFI->getStackSize(); + + if (RI->needsStackRealignment(MF)) { + if (FI < 0) { + // Skip the saved EBP. + Offset += RI->getSlotSize(); + } else { + assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0); + return Offset + StackSize; + } + // FIXME: Support tail calls + } else { + if (!hasFP(MF)) + return Offset + StackSize; + + // Skip the saved EBP. + Offset += RI->getSlotSize(); + + // Skip the RETADDR move area + const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); + if (TailCallReturnAddrDelta < 0) + Offset -= TailCallReturnAddrDelta; + } + + return Offset; +} + +bool X86FrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const { + if (CSI.empty()) + return false; + + DebugLoc DL = MBB.findDebugLoc(MI); + + MachineFunction &MF = *MBB.getParent(); + + unsigned SlotSize = STI.is64Bit() ? 8 : 4; + unsigned FPReg = TRI->getFrameRegister(MF); + unsigned CalleeFrameSize = 0; + + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + + // Push GPRs. It increases frame size. + unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r; + for (unsigned i = CSI.size(); i != 0; --i) { + unsigned Reg = CSI[i-1].getReg(); + if (!X86::GR64RegClass.contains(Reg) && + !X86::GR32RegClass.contains(Reg)) + continue; + // Add the callee-saved register as live-in. It's killed at the spill. + MBB.addLiveIn(Reg); + if (Reg == FPReg) + // X86RegisterInfo::emitPrologue will handle spilling of frame register. + continue; + CalleeFrameSize += SlotSize; + BuildMI(MBB, MI, DL, TII.get(Opc)).addReg(Reg, RegState::Kill) + .setMIFlag(MachineInstr::FrameSetup); + } + + X86FI->setCalleeSavedFrameSize(CalleeFrameSize); + + // Make XMM regs spilled. X86 does not have ability of push/pop XMM. + // It can be done by spilling XMMs to stack frame. + // Note that only Win64 ABI might spill XMMs. + for (unsigned i = CSI.size(); i != 0; --i) { + unsigned Reg = CSI[i-1].getReg(); + if (X86::GR64RegClass.contains(Reg) || + X86::GR32RegClass.contains(Reg)) + continue; + // Add the callee-saved register as live-in. It's killed at the spill. + MBB.addLiveIn(Reg); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + TII.storeRegToStackSlot(MBB, MI, Reg, true, CSI[i-1].getFrameIdx(), + RC, TRI); + } + + return true; +} + +bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const { + if (CSI.empty()) + return false; + + DebugLoc DL = MBB.findDebugLoc(MI); + + MachineFunction &MF = *MBB.getParent(); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + + // Reload XMMs from stack frame. 
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + unsigned Reg = CSI[i].getReg(); + if (X86::GR64RegClass.contains(Reg) || + X86::GR32RegClass.contains(Reg)) + continue; + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + TII.loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), + RC, TRI); + } + + // POP GPRs. + unsigned FPReg = TRI->getFrameRegister(MF); + unsigned Opc = STI.is64Bit() ? X86::POP64r : X86::POP32r; + for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + unsigned Reg = CSI[i].getReg(); + if (!X86::GR64RegClass.contains(Reg) && + !X86::GR32RegClass.contains(Reg)) + continue; + if (Reg == FPReg) + // X86RegisterInfo::emitEpilogue will handle restoring of frame register. + continue; + BuildMI(MBB, MI, DL, TII.get(Opc), Reg); + } + return true; +} + +void +X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, + RegScavenger *RS) const { + MachineFrameInfo *MFI = MF.getFrameInfo(); + const X86RegisterInfo *RegInfo = TM.getRegisterInfo(); + unsigned SlotSize = RegInfo->getSlotSize(); + + X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + int32_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); + + if (TailCallReturnAddrDelta < 0) { + // create RETURNADDR area + // arg + // arg + // RETADDR + // { ... + // RETADDR area + // ... + // } + // [EBP] + MFI->CreateFixedObject(-TailCallReturnAddrDelta, + (-1U*SlotSize)+TailCallReturnAddrDelta, true); + } + + if (hasFP(MF)) { + assert((TailCallReturnAddrDelta <= 0) && + "The Delta should always be zero or negative"); + const TargetFrameLowering &TFI = *MF.getTarget().getFrameLowering(); + + // Create a frame entry for the EBP register that must be saved. + int FrameIdx = MFI->CreateFixedObject(SlotSize, + -(int)SlotSize + + TFI.getOffsetOfLocalArea() + + TailCallReturnAddrDelta, + true); + assert(FrameIdx == MFI->getObjectIndexBegin() && + "Slot for EBP register must be last in order to be found!"); + (void)FrameIdx; + } +} + +static bool +HasNestArgument(const MachineFunction *MF) { + const Function *F = MF->getFunction(); + for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); + I != E; I++) { + if (I->hasNestAttr()) + return true; + } + return false; +} + +static unsigned +GetScratchRegister(bool Is64Bit, const MachineFunction &MF) { + if (Is64Bit) { + return X86::R11; + } else { + CallingConv::ID CallingConvention = MF.getFunction()->getCallingConv(); + bool IsNested = HasNestArgument(&MF); + + if (CallingConvention == CallingConv::X86_FastCall) { + if (IsNested) { + report_fatal_error("Segmented stacks does not support fastcall with " + "nested function."); + return -1; + } else { + return X86::EAX; + } + } else { + if (IsNested) + return X86::EDX; + else + return X86::ECX; + } + } +} + +void +X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { + MachineBasicBlock &prologueMBB = MF.front(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + const X86InstrInfo &TII = *TM.getInstrInfo(); + uint64_t StackSize; + bool Is64Bit = STI.is64Bit(); + unsigned TlsReg, TlsOffset; + DebugLoc DL; + const X86Subtarget *ST = &MF.getTarget().getSubtarget<X86Subtarget>(); + + unsigned ScratchReg = GetScratchRegister(Is64Bit, MF); + assert(!MF.getRegInfo().isLiveIn(ScratchReg) && + "Scratch register is live-in"); + + if (MF.getFunction()->isVarArg()) + report_fatal_error("Segmented stacks do not support vararg functions."); + if (!ST->isTargetLinux()) + report_fatal_error("Segmented stacks supported only on linux."); + + MachineBasicBlock 
*allocMBB = MF.CreateMachineBasicBlock(); + MachineBasicBlock *checkMBB = MF.CreateMachineBasicBlock(); + X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + bool IsNested = false; + + // We need to know if the function has a nest argument only in 64 bit mode. + if (Is64Bit) + IsNested = HasNestArgument(&MF); + + // The MOV R10, RAX needs to be in a different block, since the RET we emit in + // allocMBB needs to be last (terminating) instruction. + MachineBasicBlock *restoreR10MBB = NULL; + if (IsNested) + restoreR10MBB = MF.CreateMachineBasicBlock(); + + for (MachineBasicBlock::livein_iterator i = prologueMBB.livein_begin(), + e = prologueMBB.livein_end(); i != e; i++) { + allocMBB->addLiveIn(*i); + checkMBB->addLiveIn(*i); + + if (IsNested) + restoreR10MBB->addLiveIn(*i); + } + + if (IsNested) { + allocMBB->addLiveIn(X86::R10); + restoreR10MBB->addLiveIn(X86::RAX); + } + + if (IsNested) + MF.push_front(restoreR10MBB); + MF.push_front(allocMBB); + MF.push_front(checkMBB); + + // Eventually StackSize will be calculated by a link-time pass; which will + // also decide whether checking code needs to be injected into this particular + // prologue. + StackSize = MFI->getStackSize(); + + // Read the limit off the current stacklet off the stack_guard location. + if (Is64Bit) { + TlsReg = X86::FS; + TlsOffset = 0x70; + + BuildMI(checkMBB, DL, TII.get(X86::LEA64r), ScratchReg).addReg(X86::RSP) + .addImm(0).addReg(0).addImm(-StackSize).addReg(0); + BuildMI(checkMBB, DL, TII.get(X86::CMP64rm)).addReg(ScratchReg) + .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg); + } else { + TlsReg = X86::GS; + TlsOffset = 0x30; + + BuildMI(checkMBB, DL, TII.get(X86::LEA32r), ScratchReg).addReg(X86::ESP) + .addImm(0).addReg(0).addImm(-StackSize).addReg(0); + BuildMI(checkMBB, DL, TII.get(X86::CMP32rm)).addReg(ScratchReg) + .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg); + } + + // This jump is taken if SP >= (Stacklet Limit + Stack Space required). + // It jumps to normal execution of the function body. + BuildMI(checkMBB, DL, TII.get(X86::JG_4)).addMBB(&prologueMBB); + + // On 32 bit we first push the arguments size and then the frame size. On 64 + // bit, we pass the stack frame size in r10 and the argument size in r11. + if (Is64Bit) { + // Functions with nested arguments use R10, so it needs to be saved across + // the call to _morestack + + if (IsNested) + BuildMI(allocMBB, DL, TII.get(X86::MOV64rr), X86::RAX).addReg(X86::R10); + + BuildMI(allocMBB, DL, TII.get(X86::MOV64ri), X86::R10) + .addImm(StackSize); + BuildMI(allocMBB, DL, TII.get(X86::MOV64ri), X86::R11) + .addImm(X86FI->getArgumentStackSize()); + MF.getRegInfo().setPhysRegUsed(X86::R10); + MF.getRegInfo().setPhysRegUsed(X86::R11); + } else { + // Since we'll call __morestack, stack alignment needs to be preserved. + BuildMI(allocMBB, DL, TII.get(X86::SUB32ri), X86::ESP).addReg(X86::ESP) + .addImm(8); + BuildMI(allocMBB, DL, TII.get(X86::PUSHi32)) + .addImm(X86FI->getArgumentStackSize()); + BuildMI(allocMBB, DL, TII.get(X86::PUSHi32)) + .addImm(StackSize); + } + + // __morestack is in libgcc + if (Is64Bit) + BuildMI(allocMBB, DL, TII.get(X86::CALL64pcrel32)) + .addExternalSymbol("__morestack"); + else + BuildMI(allocMBB, DL, TII.get(X86::CALLpcrel32)) + .addExternalSymbol("__morestack"); + + // __morestack only seems to remove 8 bytes off the stack. Add back the + // additional 8 bytes we added before pushing the arguments. 
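+  // Roughly, the 32-bit sequence assembled in allocMBB is:
+  //   subl  $8, %esp        ; padding so the pushes keep the stack aligned
+  //   pushl <argument size>
+  //   pushl <frame size>
+  //   calll __morestack     ; pops its 8 bytes of arguments
+  //   addl  $8, %esp        ; drop the padding again
+  //   ret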
+ if (!Is64Bit) + BuildMI(allocMBB, DL, TII.get(X86::ADD32ri), X86::ESP).addReg(X86::ESP) + .addImm(8); + BuildMI(allocMBB, DL, TII.get(X86::RET)); + + if (IsNested) + BuildMI(restoreR10MBB, DL, TII.get(X86::MOV64rr), X86::R10) + .addReg(X86::RAX); + + if (IsNested) { + allocMBB->addSuccessor(restoreR10MBB); + restoreR10MBB->addSuccessor(&prologueMBB); + } else { + allocMBB->addSuccessor(&prologueMBB); + } + + checkMBB->addSuccessor(allocMBB); + checkMBB->addSuccessor(&prologueMBB); + +#ifdef XDEBUG + MF.verify(); +#endif +} diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.h b/contrib/llvm/lib/Target/X86/X86FrameLowering.h new file mode 100644 index 0000000..6f49064 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.h @@ -0,0 +1,68 @@ +//=-- X86TargetFrameLowering.h - Define frame lowering for X86 ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This class implements X86-specific bits of TargetFrameLowering class. +// +//===----------------------------------------------------------------------===// + +#ifndef X86_FRAMELOWERING_H +#define X86_FRAMELOWERING_H + +#include "X86Subtarget.h" +#include "llvm/MC/MCDwarf.h" +#include "llvm/Target/TargetFrameLowering.h" + +namespace llvm { + class MCSymbol; + class X86TargetMachine; + +class X86FrameLowering : public TargetFrameLowering { + const X86TargetMachine &TM; + const X86Subtarget &STI; +public: + explicit X86FrameLowering(const X86TargetMachine &tm, const X86Subtarget &sti) + : TargetFrameLowering(StackGrowsDown, + sti.getStackAlignment(), + (sti.is64Bit() ? -8 : -4)), + TM(tm), STI(sti) { + } + + void emitCalleeSavedFrameMoves(MachineFunction &MF, MCSymbol *Label, + unsigned FramePtr) const; + + /// emitProlog/emitEpilog - These methods insert prolog and epilog code into + /// the function. + void emitPrologue(MachineFunction &MF) const; + void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; + + void adjustForSegmentedStacks(MachineFunction &MF) const; + + void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, + RegScavenger *RS = NULL) const; + + bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const; + + bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const; + + bool hasFP(const MachineFunction &MF) const; + bool hasReservedCallFrame(const MachineFunction &MF) const; + + int getFrameIndexOffset(const MachineFunction &MF, int FI) const; + uint32_t getCompactUnwindEncoding(MachineFunction &MF) const; +}; + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp new file mode 100644 index 0000000..02b0ff2 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -0,0 +1,2259 @@ +//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file defines a DAG pattern matching instruction selector for X86, +// converting from a legalized dag to a X86 dag. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "x86-isel" +#include "X86.h" +#include "X86InstrBuilder.h" +#include "X86MachineFunctionInfo.h" +#include "X86RegisterInfo.h" +#include "X86Subtarget.h" +#include "X86TargetMachine.h" +#include "llvm/Instructions.h" +#include "llvm/Intrinsics.h" +#include "llvm/Support/CFG.h" +#include "llvm/Type.h" +#include "llvm/CodeGen/FunctionLoweringInfo.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/SelectionDAGISel.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" +using namespace llvm; + +STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor"); + +//===----------------------------------------------------------------------===// +// Pattern Matcher Implementation +//===----------------------------------------------------------------------===// + +namespace { + /// X86ISelAddressMode - This corresponds to X86AddressMode, but uses + /// SDValue's instead of register numbers for the leaves of the matched + /// tree. + struct X86ISelAddressMode { + enum { + RegBase, + FrameIndexBase + } BaseType; + + // This is really a union, discriminated by BaseType! + SDValue Base_Reg; + int Base_FrameIndex; + + unsigned Scale; + SDValue IndexReg; + int32_t Disp; + SDValue Segment; + const GlobalValue *GV; + const Constant *CP; + const BlockAddress *BlockAddr; + const char *ES; + int JT; + unsigned Align; // CP alignment. + unsigned char SymbolFlags; // X86II::MO_* + + X86ISelAddressMode() + : BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0), + Segment(), GV(0), CP(0), BlockAddr(0), ES(0), JT(-1), Align(0), + SymbolFlags(X86II::MO_NO_FLAG) { + } + + bool hasSymbolicDisplacement() const { + return GV != 0 || CP != 0 || ES != 0 || JT != -1 || BlockAddr != 0; + } + + bool hasBaseOrIndexReg() const { + return IndexReg.getNode() != 0 || Base_Reg.getNode() != 0; + } + + /// isRIPRelative - Return true if this addressing mode is already RIP + /// relative. 
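+    /// (This is typically the case after MatchWrapper has set the base
+    /// register to X86::RIP for a RIP-relative global or constant-pool
+    /// reference.)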
+ bool isRIPRelative() const { + if (BaseType != RegBase) return false; + if (RegisterSDNode *RegNode = + dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode())) + return RegNode->getReg() == X86::RIP; + return false; + } + + void setBaseReg(SDValue Reg) { + BaseType = RegBase; + Base_Reg = Reg; + } + + void dump() { + dbgs() << "X86ISelAddressMode " << this << '\n'; + dbgs() << "Base_Reg "; + if (Base_Reg.getNode() != 0) + Base_Reg.getNode()->dump(); + else + dbgs() << "nul"; + dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n' + << " Scale" << Scale << '\n' + << "IndexReg "; + if (IndexReg.getNode() != 0) + IndexReg.getNode()->dump(); + else + dbgs() << "nul"; + dbgs() << " Disp " << Disp << '\n' + << "GV "; + if (GV) + GV->dump(); + else + dbgs() << "nul"; + dbgs() << " CP "; + if (CP) + CP->dump(); + else + dbgs() << "nul"; + dbgs() << '\n' + << "ES "; + if (ES) + dbgs() << ES; + else + dbgs() << "nul"; + dbgs() << " JT" << JT << " Align" << Align << '\n'; + } + }; +} + +namespace { + //===--------------------------------------------------------------------===// + /// ISel - X86 specific code to select X86 machine instructions for + /// SelectionDAG operations. + /// + class X86DAGToDAGISel : public SelectionDAGISel { + /// X86Lowering - This object fully describes how to lower LLVM code to an + /// X86-specific SelectionDAG. + const X86TargetLowering &X86Lowering; + + /// Subtarget - Keep a pointer to the X86Subtarget around so that we can + /// make the right decision when generating code for different targets. + const X86Subtarget *Subtarget; + + /// OptForSize - If true, selector should try to optimize for code size + /// instead of performance. + bool OptForSize; + + public: + explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel) + : SelectionDAGISel(tm, OptLevel), + X86Lowering(*tm.getTargetLowering()), + Subtarget(&tm.getSubtarget<X86Subtarget>()), + OptForSize(false) {} + + virtual const char *getPassName() const { + return "X86 DAG->DAG Instruction Selection"; + } + + virtual void EmitFunctionEntryCode(); + + virtual bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const; + + virtual void PreprocessISelDAG(); + + inline bool immSext8(SDNode *N) const { + return isInt<8>(cast<ConstantSDNode>(N)->getSExtValue()); + } + + // i64immSExt32 predicate - True if the 64-bit immediate fits in a 32-bit + // sign extended field. + inline bool i64immSExt32(SDNode *N) const { + uint64_t v = cast<ConstantSDNode>(N)->getZExtValue(); + return (int64_t)v == (int32_t)v; + } + +// Include the pieces autogenerated from the target description. 
+#include "X86GenDAGISel.inc" + + private: + SDNode *Select(SDNode *N); + SDNode *SelectAtomic64(SDNode *Node, unsigned Opc); + SDNode *SelectAtomicLoadAdd(SDNode *Node, EVT NVT); + SDNode *SelectAtomicLoadArith(SDNode *Node, EVT NVT); + + bool FoldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM); + bool MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM); + bool MatchWrapper(SDValue N, X86ISelAddressMode &AM); + bool MatchAddress(SDValue N, X86ISelAddressMode &AM); + bool MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, + unsigned Depth); + bool MatchAddressBase(SDValue N, X86ISelAddressMode &AM); + bool SelectAddr(SDNode *Parent, SDValue N, SDValue &Base, + SDValue &Scale, SDValue &Index, SDValue &Disp, + SDValue &Segment); + bool SelectLEAAddr(SDValue N, SDValue &Base, + SDValue &Scale, SDValue &Index, SDValue &Disp, + SDValue &Segment); + bool SelectTLSADDRAddr(SDValue N, SDValue &Base, + SDValue &Scale, SDValue &Index, SDValue &Disp, + SDValue &Segment); + bool SelectScalarSSELoad(SDNode *Root, SDValue N, + SDValue &Base, SDValue &Scale, + SDValue &Index, SDValue &Disp, + SDValue &Segment, + SDValue &NodeWithChain); + + bool TryFoldLoad(SDNode *P, SDValue N, + SDValue &Base, SDValue &Scale, + SDValue &Index, SDValue &Disp, + SDValue &Segment); + + /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for + /// inline asm expressions. + virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op, + char ConstraintCode, + std::vector<SDValue> &OutOps); + + void EmitSpecialCodeForMain(MachineBasicBlock *BB, MachineFrameInfo *MFI); + + inline void getAddressOperands(X86ISelAddressMode &AM, SDValue &Base, + SDValue &Scale, SDValue &Index, + SDValue &Disp, SDValue &Segment) { + Base = (AM.BaseType == X86ISelAddressMode::FrameIndexBase) ? + CurDAG->getTargetFrameIndex(AM.Base_FrameIndex, TLI.getPointerTy()) : + AM.Base_Reg; + Scale = getI8Imm(AM.Scale); + Index = AM.IndexReg; + // These are 32-bit even in 64-bit mode since RIP relative offset + // is 32-bit. + if (AM.GV) + Disp = CurDAG->getTargetGlobalAddress(AM.GV, DebugLoc(), + MVT::i32, AM.Disp, + AM.SymbolFlags); + else if (AM.CP) + Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, + AM.Align, AM.Disp, AM.SymbolFlags); + else if (AM.ES) + Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags); + else if (AM.JT != -1) + Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags); + else if (AM.BlockAddr) + Disp = CurDAG->getBlockAddress(AM.BlockAddr, MVT::i32, + true, AM.SymbolFlags); + else + Disp = CurDAG->getTargetConstant(AM.Disp, MVT::i32); + + if (AM.Segment.getNode()) + Segment = AM.Segment; + else + Segment = CurDAG->getRegister(0, MVT::i32); + } + + /// getI8Imm - Return a target constant with the specified value, of type + /// i8. + inline SDValue getI8Imm(unsigned Imm) { + return CurDAG->getTargetConstant(Imm, MVT::i8); + } + + /// getI32Imm - Return a target constant with the specified value, of type + /// i32. + inline SDValue getI32Imm(unsigned Imm) { + return CurDAG->getTargetConstant(Imm, MVT::i32); + } + + /// getGlobalBaseReg - Return an SDNode that returns the value of + /// the global base register. Output instructions required to + /// initialize the global base register, if necessary. + /// + SDNode *getGlobalBaseReg(); + + /// getTargetMachine - Return a reference to the TargetMachine, casted + /// to the target-specific type. 
+ const X86TargetMachine &getTargetMachine() { + return static_cast<const X86TargetMachine &>(TM); + } + + /// getInstrInfo - Return a reference to the TargetInstrInfo, casted + /// to the target-specific type. + const X86InstrInfo *getInstrInfo() { + return getTargetMachine().getInstrInfo(); + } + }; +} + + +bool +X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const { + if (OptLevel == CodeGenOpt::None) return false; + + if (!N.hasOneUse()) + return false; + + if (N.getOpcode() != ISD::LOAD) + return true; + + // If N is a load, do additional profitability checks. + if (U == Root) { + switch (U->getOpcode()) { + default: break; + case X86ISD::ADD: + case X86ISD::SUB: + case X86ISD::AND: + case X86ISD::XOR: + case X86ISD::OR: + case ISD::ADD: + case ISD::ADDC: + case ISD::ADDE: + case ISD::AND: + case ISD::OR: + case ISD::XOR: { + SDValue Op1 = U->getOperand(1); + + // If the other operand is a 8-bit immediate we should fold the immediate + // instead. This reduces code size. + // e.g. + // movl 4(%esp), %eax + // addl $4, %eax + // vs. + // movl $4, %eax + // addl 4(%esp), %eax + // The former is 2 bytes shorter. In case where the increment is 1, then + // the saving can be 4 bytes (by using incl %eax). + if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Op1)) + if (Imm->getAPIntValue().isSignedIntN(8)) + return false; + + // If the other operand is a TLS address, we should fold it instead. + // This produces + // movl %gs:0, %eax + // leal i@NTPOFF(%eax), %eax + // instead of + // movl $i@NTPOFF, %eax + // addl %gs:0, %eax + // if the block also has an access to a second TLS address this will save + // a load. + // FIXME: This is probably also true for non TLS addresses. + if (Op1.getOpcode() == X86ISD::Wrapper) { + SDValue Val = Op1.getOperand(0); + if (Val.getOpcode() == ISD::TargetGlobalTLSAddress) + return false; + } + } + } + } + + return true; +} + +/// MoveBelowCallOrigChain - Replace the original chain operand of the call with +/// load's chain operand and move load below the call's chain operand. +static void MoveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load, + SDValue Call, SDValue OrigChain) { + SmallVector<SDValue, 8> Ops; + SDValue Chain = OrigChain.getOperand(0); + if (Chain.getNode() == Load.getNode()) + Ops.push_back(Load.getOperand(0)); + else { + assert(Chain.getOpcode() == ISD::TokenFactor && + "Unexpected chain operand"); + for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) + if (Chain.getOperand(i).getNode() == Load.getNode()) + Ops.push_back(Load.getOperand(0)); + else + Ops.push_back(Chain.getOperand(i)); + SDValue NewChain = + CurDAG->getNode(ISD::TokenFactor, Load.getDebugLoc(), + MVT::Other, &Ops[0], Ops.size()); + Ops.clear(); + Ops.push_back(NewChain); + } + for (unsigned i = 1, e = OrigChain.getNumOperands(); i != e; ++i) + Ops.push_back(OrigChain.getOperand(i)); + CurDAG->UpdateNodeOperands(OrigChain.getNode(), &Ops[0], Ops.size()); + CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0), + Load.getOperand(1), Load.getOperand(2)); + Ops.clear(); + Ops.push_back(SDValue(Load.getNode(), 1)); + for (unsigned i = 1, e = Call.getNode()->getNumOperands(); i != e; ++i) + Ops.push_back(Call.getOperand(i)); + CurDAG->UpdateNodeOperands(Call.getNode(), &Ops[0], Ops.size()); +} + +/// isCalleeLoad - Return true if call address is a load and it can be +/// moved below CALLSEQ_START and the chains leading up to the call. +/// Return the CALLSEQ_START by reference as a second output. 
+/// In the case of a tail call, there isn't a callseq node between the call +/// chain and the load. +static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) { + if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse()) + return false; + LoadSDNode *LD = dyn_cast<LoadSDNode>(Callee.getNode()); + if (!LD || + LD->isVolatile() || + LD->getAddressingMode() != ISD::UNINDEXED || + LD->getExtensionType() != ISD::NON_EXTLOAD) + return false; + + // Now let's find the callseq_start. + while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) { + if (!Chain.hasOneUse()) + return false; + Chain = Chain.getOperand(0); + } + + if (!Chain.getNumOperands()) + return false; + if (Chain.getOperand(0).getNode() == Callee.getNode()) + return true; + if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor && + Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) && + Callee.getValue(1).hasOneUse()) + return true; + return false; +} + +void X86DAGToDAGISel::PreprocessISelDAG() { + // OptForSize is used in pattern predicates that isel is matching. + OptForSize = MF->getFunction()->hasFnAttr(Attribute::OptimizeForSize); + + for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), + E = CurDAG->allnodes_end(); I != E; ) { + SDNode *N = I++; // Preincrement iterator to avoid invalidation issues. + + if (OptLevel != CodeGenOpt::None && + (N->getOpcode() == X86ISD::CALL || + N->getOpcode() == X86ISD::TC_RETURN)) { + /// Also try moving call address load from outside callseq_start to just + /// before the call to allow it to be folded. + /// + /// [Load chain] + /// ^ + /// | + /// [Load] + /// ^ ^ + /// | | + /// / \-- + /// / | + ///[CALLSEQ_START] | + /// ^ | + /// | | + /// [LOAD/C2Reg] | + /// | | + /// \ / + /// \ / + /// [CALL] + bool HasCallSeq = N->getOpcode() == X86ISD::CALL; + SDValue Chain = N->getOperand(0); + SDValue Load = N->getOperand(1); + if (!isCalleeLoad(Load, Chain, HasCallSeq)) + continue; + MoveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain); + ++NumLoadMoved; + continue; + } + + // Lower fpround and fpextend nodes that target the FP stack to be store and + // load to the stack. This is a gross hack. We would like to simply mark + // these as being illegal, but when we do that, legalize produces these when + // it expands calls, then expands these in the same legalize pass. We would + // like dag combine to be able to hack on these between the call expansion + // and the node legalization. As such this pass basically does "really + // late" legalization of these inline with the X86 isel pass. + // FIXME: This should only happen when not compiled with -O0. + if (N->getOpcode() != ISD::FP_ROUND && N->getOpcode() != ISD::FP_EXTEND) + continue; + + EVT SrcVT = N->getOperand(0).getValueType(); + EVT DstVT = N->getValueType(0); + + // If any of the sources are vectors, no fp stack involved. + if (SrcVT.isVector() || DstVT.isVector()) + continue; + + // If the source and destination are SSE registers, then this is a legal + // conversion that should not be lowered. + bool SrcIsSSE = X86Lowering.isScalarFPTypeInSSEReg(SrcVT); + bool DstIsSSE = X86Lowering.isScalarFPTypeInSSEReg(DstVT); + if (SrcIsSSE && DstIsSSE) + continue; + + if (!SrcIsSSE && !DstIsSSE) { + // If this is an FPStack extension, it is a noop. + if (N->getOpcode() == ISD::FP_EXTEND) + continue; + // If this is a value-preserving FPStack truncation, it is a noop. 
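+      // (Operand 1 of FP_ROUND is a flag; when it is nonzero the truncation
+      // is known not to change the value, so no store/load round trip is
+      // needed.)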
+ if (N->getConstantOperandVal(1)) + continue; + } + + // Here we could have an FP stack truncation or an FPStack <-> SSE convert. + // FPStack has extload and truncstore. SSE can fold direct loads into other + // operations. Based on this, decide what we want to do. + EVT MemVT; + if (N->getOpcode() == ISD::FP_ROUND) + MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'. + else + MemVT = SrcIsSSE ? SrcVT : DstVT; + + SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT); + DebugLoc dl = N->getDebugLoc(); + + // FIXME: optimize the case where the src/dest is a load or store? + SDValue Store = CurDAG->getTruncStore(CurDAG->getEntryNode(), dl, + N->getOperand(0), + MemTmp, MachinePointerInfo(), MemVT, + false, false, 0); + SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp, + MachinePointerInfo(), + MemVT, false, false, 0); + + // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the + // extload we created. This will cause general havok on the dag because + // anything below the conversion could be folded into other existing nodes. + // To avoid invalidating 'I', back it up to the convert node. + --I; + CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result); + + // Now that we did that, the node is dead. Increment the iterator to the + // next node to process, then delete N. + ++I; + CurDAG->DeleteNode(N); + } +} + + +/// EmitSpecialCodeForMain - Emit any code that needs to be executed only in +/// the main function. +void X86DAGToDAGISel::EmitSpecialCodeForMain(MachineBasicBlock *BB, + MachineFrameInfo *MFI) { + const TargetInstrInfo *TII = TM.getInstrInfo(); + if (Subtarget->isTargetCygMing()) { + unsigned CallOp = + Subtarget->is64Bit() ? X86::WINCALL64pcrel32 : X86::CALLpcrel32; + BuildMI(BB, DebugLoc(), + TII->get(CallOp)).addExternalSymbol("__main"); + } +} + +void X86DAGToDAGISel::EmitFunctionEntryCode() { + // If this is main, emit special code for main. + if (const Function *Fn = MF->getFunction()) + if (Fn->hasExternalLinkage() && Fn->getName() == "main") + EmitSpecialCodeForMain(MF->begin(), MF->getFrameInfo()); +} + +static bool isDispSafeForFrameIndex(int64_t Val) { + // On 64-bit platforms, we can run into an issue where a frame index + // includes a displacement that, when added to the explicit displacement, + // will overflow the displacement field. Assuming that the frame index + // displacement fits into a 31-bit integer (which is only slightly more + // aggressive than the current fundamental assumption that it fits into + // a 32-bit integer), a 31-bit disp should always be safe. + return isInt<31>(Val); +} + +bool X86DAGToDAGISel::FoldOffsetIntoAddress(uint64_t Offset, + X86ISelAddressMode &AM) { + int64_t Val = AM.Disp + Offset; + CodeModel::Model M = TM.getCodeModel(); + if (Subtarget->is64Bit()) { + if (!X86::isOffsetSuitableForCodeModel(Val, M, + AM.hasSymbolicDisplacement())) + return true; + // In addition to the checks required for a register base, check that + // we do not try to use an unsafe Disp with a frame index. + if (AM.BaseType == X86ISelAddressMode::FrameIndexBase && + !isDispSafeForFrameIndex(Val)) + return true; + } + AM.Disp = Val; + return false; + +} + +bool X86DAGToDAGISel::MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){ + SDValue Address = N->getOperand(1); + + // load gs:0 -> GS segment register. + // load fs:0 -> FS segment register. + // + // This optimization is valid because the GNU TLS model defines that + // gs:0 (or fs:0 on X86-64) contains its own address. 
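+  // Reading the thread pointer therefore becomes a single "movl %gs:0, %eax"
+  // (or "movq %fs:0, %rax" in 64-bit mode) with no separate base register.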
+ // For more information see http://people.redhat.com/drepper/tls.pdf + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Address)) + if (C->getSExtValue() == 0 && AM.Segment.getNode() == 0 && + Subtarget->isTargetELF()) + switch (N->getPointerInfo().getAddrSpace()) { + case 256: + AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16); + return false; + case 257: + AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16); + return false; + } + + return true; +} + +/// MatchWrapper - Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes +/// into an addressing mode. These wrap things that will resolve down into a +/// symbol reference. If no match is possible, this returns true, otherwise it +/// returns false. +bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) { + // If the addressing mode already has a symbol as the displacement, we can + // never match another symbol. + if (AM.hasSymbolicDisplacement()) + return true; + + SDValue N0 = N.getOperand(0); + CodeModel::Model M = TM.getCodeModel(); + + // Handle X86-64 rip-relative addresses. We check this before checking direct + // folding because RIP is preferable to non-RIP accesses. + if (Subtarget->is64Bit() && + // Under X86-64 non-small code model, GV (and friends) are 64-bits, so + // they cannot be folded into immediate fields. + // FIXME: This can be improved for kernel and other models? + (M == CodeModel::Small || M == CodeModel::Kernel) && + // Base and index reg must be 0 in order to use %rip as base and lowering + // must allow RIP. + !AM.hasBaseOrIndexReg() && N.getOpcode() == X86ISD::WrapperRIP) { + if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) { + X86ISelAddressMode Backup = AM; + AM.GV = G->getGlobal(); + AM.SymbolFlags = G->getTargetFlags(); + if (FoldOffsetIntoAddress(G->getOffset(), AM)) { + AM = Backup; + return true; + } + } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) { + X86ISelAddressMode Backup = AM; + AM.CP = CP->getConstVal(); + AM.Align = CP->getAlignment(); + AM.SymbolFlags = CP->getTargetFlags(); + if (FoldOffsetIntoAddress(CP->getOffset(), AM)) { + AM = Backup; + return true; + } + } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) { + AM.ES = S->getSymbol(); + AM.SymbolFlags = S->getTargetFlags(); + } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) { + AM.JT = J->getIndex(); + AM.SymbolFlags = J->getTargetFlags(); + } else { + AM.BlockAddr = cast<BlockAddressSDNode>(N0)->getBlockAddress(); + AM.SymbolFlags = cast<BlockAddressSDNode>(N0)->getTargetFlags(); + } + + if (N.getOpcode() == X86ISD::WrapperRIP) + AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64)); + return false; + } + + // Handle the case when globals fit in our immediate field: This is true for + // X86-32 always and X86-64 when in -static -mcmodel=small mode. In 64-bit + // mode, this results in a non-RIP-relative computation. 
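+  // For example, "movl x, %eax" with a 32-bit absolute displacement rather
+  // than "movl x(%rip), %eax".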
+ if (!Subtarget->is64Bit() || + ((M == CodeModel::Small || M == CodeModel::Kernel) && + TM.getRelocationModel() == Reloc::Static)) { + if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) { + AM.GV = G->getGlobal(); + AM.Disp += G->getOffset(); + AM.SymbolFlags = G->getTargetFlags(); + } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) { + AM.CP = CP->getConstVal(); + AM.Align = CP->getAlignment(); + AM.Disp += CP->getOffset(); + AM.SymbolFlags = CP->getTargetFlags(); + } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) { + AM.ES = S->getSymbol(); + AM.SymbolFlags = S->getTargetFlags(); + } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) { + AM.JT = J->getIndex(); + AM.SymbolFlags = J->getTargetFlags(); + } else { + AM.BlockAddr = cast<BlockAddressSDNode>(N0)->getBlockAddress(); + AM.SymbolFlags = cast<BlockAddressSDNode>(N0)->getTargetFlags(); + } + return false; + } + + return true; +} + +/// MatchAddress - Add the specified node to the specified addressing mode, +/// returning true if it cannot be done. This just pattern matches for the +/// addressing mode. +bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM) { + if (MatchAddressRecursively(N, AM, 0)) + return true; + + // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has + // a smaller encoding and avoids a scaled-index. + if (AM.Scale == 2 && + AM.BaseType == X86ISelAddressMode::RegBase && + AM.Base_Reg.getNode() == 0) { + AM.Base_Reg = AM.IndexReg; + AM.Scale = 1; + } + + // Post-processing: Convert foo to foo(%rip), even in non-PIC mode, + // because it has a smaller encoding. + // TODO: Which other code models can use this? + if (TM.getCodeModel() == CodeModel::Small && + Subtarget->is64Bit() && + AM.Scale == 1 && + AM.BaseType == X86ISelAddressMode::RegBase && + AM.Base_Reg.getNode() == 0 && + AM.IndexReg.getNode() == 0 && + AM.SymbolFlags == X86II::MO_NO_FLAG && + AM.hasSymbolicDisplacement()) + AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64); + + return false; +} + +bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, + unsigned Depth) { + DebugLoc dl = N.getDebugLoc(); + DEBUG({ + dbgs() << "MatchAddress: "; + AM.dump(); + }); + // Limit recursion. + if (Depth > 5) + return MatchAddressBase(N, AM); + + // If this is already a %rip relative address, we can only merge immediates + // into it. Instead of handling this in every case, we handle it here. + // RIP relative addressing: %rip + 32-bit displacement! + if (AM.isRIPRelative()) { + // FIXME: JumpTable and ExternalSymbol address currently don't like + // displacements. It isn't very important, but this should be fixed for + // consistency. 
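+    // For now, only a plain constant offset can be merged into a
+    // RIP-relative address below; everything else is rejected.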
+ if (!AM.ES && AM.JT != -1) return true; + + if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N)) + if (!FoldOffsetIntoAddress(Cst->getSExtValue(), AM)) + return false; + return true; + } + + switch (N.getOpcode()) { + default: break; + case ISD::Constant: { + uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue(); + if (!FoldOffsetIntoAddress(Val, AM)) + return false; + break; + } + + case X86ISD::Wrapper: + case X86ISD::WrapperRIP: + if (!MatchWrapper(N, AM)) + return false; + break; + + case ISD::LOAD: + if (!MatchLoadInAddress(cast<LoadSDNode>(N), AM)) + return false; + break; + + case ISD::FrameIndex: + if (AM.BaseType == X86ISelAddressMode::RegBase && + AM.Base_Reg.getNode() == 0 && + (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) { + AM.BaseType = X86ISelAddressMode::FrameIndexBase; + AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex(); + return false; + } + break; + + case ISD::SHL: + if (AM.IndexReg.getNode() != 0 || AM.Scale != 1) + break; + + if (ConstantSDNode + *CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1))) { + unsigned Val = CN->getZExtValue(); + // Note that we handle x<<1 as (,x,2) rather than (x,x) here so + // that the base operand remains free for further matching. If + // the base doesn't end up getting used, a post-processing step + // in MatchAddress turns (,x,2) into (x,x), which is cheaper. + if (Val == 1 || Val == 2 || Val == 3) { + AM.Scale = 1 << Val; + SDValue ShVal = N.getNode()->getOperand(0); + + // Okay, we know that we have a scale by now. However, if the scaled + // value is an add of something and a constant, we can fold the + // constant into the disp field here. + if (CurDAG->isBaseWithConstantOffset(ShVal)) { + AM.IndexReg = ShVal.getNode()->getOperand(0); + ConstantSDNode *AddVal = + cast<ConstantSDNode>(ShVal.getNode()->getOperand(1)); + uint64_t Disp = AddVal->getSExtValue() << Val; + if (!FoldOffsetIntoAddress(Disp, AM)) + return false; + } + + AM.IndexReg = ShVal; + return false; + } + break; + } + + case ISD::SMUL_LOHI: + case ISD::UMUL_LOHI: + // A mul_lohi where we need the low part can be folded as a plain multiply. + if (N.getResNo() != 0) break; + // FALL THROUGH + case ISD::MUL: + case X86ISD::MUL_IMM: + // X*[3,5,9] -> X+X*[2,4,8] + if (AM.BaseType == X86ISelAddressMode::RegBase && + AM.Base_Reg.getNode() == 0 && + AM.IndexReg.getNode() == 0) { + if (ConstantSDNode + *CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1))) + if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 || + CN->getZExtValue() == 9) { + AM.Scale = unsigned(CN->getZExtValue())-1; + + SDValue MulVal = N.getNode()->getOperand(0); + SDValue Reg; + + // Okay, we know that we have a scale by now. However, if the scaled + // value is an add of something and a constant, we can fold the + // constant into the disp field here. + if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() && + isa<ConstantSDNode>(MulVal.getNode()->getOperand(1))) { + Reg = MulVal.getNode()->getOperand(0); + ConstantSDNode *AddVal = + cast<ConstantSDNode>(MulVal.getNode()->getOperand(1)); + uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue(); + if (FoldOffsetIntoAddress(Disp, AM)) + Reg = N.getNode()->getOperand(0); + } else { + Reg = N.getNode()->getOperand(0); + } + + AM.IndexReg = AM.Base_Reg = Reg; + return false; + } + } + break; + + case ISD::SUB: { + // Given A-B, if A can be completely folded into the address and + // the index field with the index field unused, use -B as the index. 
+ // This is a win if a has multiple parts that can be folded into + // the address. Also, this saves a mov if the base register has + // other uses, since it avoids a two-address sub instruction, however + // it costs an additional mov if the index register has other uses. + + // Add an artificial use to this node so that we can keep track of + // it if it gets CSE'd with a different node. + HandleSDNode Handle(N); + + // Test if the LHS of the sub can be folded. + X86ISelAddressMode Backup = AM; + if (MatchAddressRecursively(N.getNode()->getOperand(0), AM, Depth+1)) { + AM = Backup; + break; + } + // Test if the index field is free for use. + if (AM.IndexReg.getNode() || AM.isRIPRelative()) { + AM = Backup; + break; + } + + int Cost = 0; + SDValue RHS = Handle.getValue().getNode()->getOperand(1); + // If the RHS involves a register with multiple uses, this + // transformation incurs an extra mov, due to the neg instruction + // clobbering its operand. + if (!RHS.getNode()->hasOneUse() || + RHS.getNode()->getOpcode() == ISD::CopyFromReg || + RHS.getNode()->getOpcode() == ISD::TRUNCATE || + RHS.getNode()->getOpcode() == ISD::ANY_EXTEND || + (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND && + RHS.getNode()->getOperand(0).getValueType() == MVT::i32)) + ++Cost; + // If the base is a register with multiple uses, this + // transformation may save a mov. + if ((AM.BaseType == X86ISelAddressMode::RegBase && + AM.Base_Reg.getNode() && + !AM.Base_Reg.getNode()->hasOneUse()) || + AM.BaseType == X86ISelAddressMode::FrameIndexBase) + --Cost; + // If the folded LHS was interesting, this transformation saves + // address arithmetic. + if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) + + ((AM.Disp != 0) && (Backup.Disp == 0)) + + (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2) + --Cost; + // If it doesn't look like it may be an overall win, don't do it. + if (Cost >= 0) { + AM = Backup; + break; + } + + // Ok, the transformation is legal and appears profitable. Go for it. + SDValue Zero = CurDAG->getConstant(0, N.getValueType()); + SDValue Neg = CurDAG->getNode(ISD::SUB, dl, N.getValueType(), Zero, RHS); + AM.IndexReg = Neg; + AM.Scale = 1; + + // Insert the new nodes into the topological ordering. + if (Zero.getNode()->getNodeId() == -1 || + Zero.getNode()->getNodeId() > N.getNode()->getNodeId()) { + CurDAG->RepositionNode(N.getNode(), Zero.getNode()); + Zero.getNode()->setNodeId(N.getNode()->getNodeId()); + } + if (Neg.getNode()->getNodeId() == -1 || + Neg.getNode()->getNodeId() > N.getNode()->getNodeId()) { + CurDAG->RepositionNode(N.getNode(), Neg.getNode()); + Neg.getNode()->setNodeId(N.getNode()->getNodeId()); + } + return false; + } + + case ISD::ADD: { + // Add an artificial use to this node so that we can keep track of + // it if it gets CSE'd with a different node. + HandleSDNode Handle(N); + + X86ISelAddressMode Backup = AM; + if (!MatchAddressRecursively(N.getOperand(0), AM, Depth+1) && + !MatchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1)) + return false; + AM = Backup; + + // Try again after commuting the operands. + if (!MatchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1)&& + !MatchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth+1)) + return false; + AM = Backup; + + // If we couldn't fold both operands into the address at the same time, + // see if we can just put each operand into a register and fold at least + // the add. 
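+    // That is, (add x, y) simply becomes base = x, index = y, scale = 1.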
+ if (AM.BaseType == X86ISelAddressMode::RegBase && + !AM.Base_Reg.getNode() && + !AM.IndexReg.getNode()) { + N = Handle.getValue(); + AM.Base_Reg = N.getOperand(0); + AM.IndexReg = N.getOperand(1); + AM.Scale = 1; + return false; + } + N = Handle.getValue(); + break; + } + + case ISD::OR: + // Handle "X | C" as "X + C" iff X is known to have C bits clear. + if (CurDAG->isBaseWithConstantOffset(N)) { + X86ISelAddressMode Backup = AM; + ConstantSDNode *CN = cast<ConstantSDNode>(N.getOperand(1)); + + // Start with the LHS as an addr mode. + if (!MatchAddressRecursively(N.getOperand(0), AM, Depth+1) && + !FoldOffsetIntoAddress(CN->getSExtValue(), AM)) + return false; + AM = Backup; + } + break; + + case ISD::AND: { + // Perform some heroic transforms on an and of a constant-count shift + // with a constant to enable use of the scaled offset field. + + SDValue Shift = N.getOperand(0); + if (Shift.getNumOperands() != 2) break; + + // Scale must not be used already. + if (AM.IndexReg.getNode() != 0 || AM.Scale != 1) break; + + SDValue X = Shift.getOperand(0); + ConstantSDNode *C2 = dyn_cast<ConstantSDNode>(N.getOperand(1)); + ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(Shift.getOperand(1)); + if (!C1 || !C2) break; + + // Handle "(X >> (8-C1)) & C2" as "(X >> 8) & 0xff)" if safe. This + // allows us to convert the shift and and into an h-register extract and + // a scaled index. + if (Shift.getOpcode() == ISD::SRL && Shift.hasOneUse()) { + unsigned ScaleLog = 8 - C1->getZExtValue(); + if (ScaleLog > 0 && ScaleLog < 4 && + C2->getZExtValue() == (UINT64_C(0xff) << ScaleLog)) { + SDValue Eight = CurDAG->getConstant(8, MVT::i8); + SDValue Mask = CurDAG->getConstant(0xff, N.getValueType()); + SDValue Srl = CurDAG->getNode(ISD::SRL, dl, N.getValueType(), + X, Eight); + SDValue And = CurDAG->getNode(ISD::AND, dl, N.getValueType(), + Srl, Mask); + SDValue ShlCount = CurDAG->getConstant(ScaleLog, MVT::i8); + SDValue Shl = CurDAG->getNode(ISD::SHL, dl, N.getValueType(), + And, ShlCount); + + // Insert the new nodes into the topological ordering. 
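+        // (Freshly created nodes have id -1; give each one the id of the node
+        // it must precede so the selector's topological ordering of node ids
+        // stays consistent.)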
+ if (Eight.getNode()->getNodeId() == -1 || + Eight.getNode()->getNodeId() > X.getNode()->getNodeId()) { + CurDAG->RepositionNode(X.getNode(), Eight.getNode()); + Eight.getNode()->setNodeId(X.getNode()->getNodeId()); + } + if (Mask.getNode()->getNodeId() == -1 || + Mask.getNode()->getNodeId() > X.getNode()->getNodeId()) { + CurDAG->RepositionNode(X.getNode(), Mask.getNode()); + Mask.getNode()->setNodeId(X.getNode()->getNodeId()); + } + if (Srl.getNode()->getNodeId() == -1 || + Srl.getNode()->getNodeId() > Shift.getNode()->getNodeId()) { + CurDAG->RepositionNode(Shift.getNode(), Srl.getNode()); + Srl.getNode()->setNodeId(Shift.getNode()->getNodeId()); + } + if (And.getNode()->getNodeId() == -1 || + And.getNode()->getNodeId() > N.getNode()->getNodeId()) { + CurDAG->RepositionNode(N.getNode(), And.getNode()); + And.getNode()->setNodeId(N.getNode()->getNodeId()); + } + if (ShlCount.getNode()->getNodeId() == -1 || + ShlCount.getNode()->getNodeId() > X.getNode()->getNodeId()) { + CurDAG->RepositionNode(X.getNode(), ShlCount.getNode()); + ShlCount.getNode()->setNodeId(N.getNode()->getNodeId()); + } + if (Shl.getNode()->getNodeId() == -1 || + Shl.getNode()->getNodeId() > N.getNode()->getNodeId()) { + CurDAG->RepositionNode(N.getNode(), Shl.getNode()); + Shl.getNode()->setNodeId(N.getNode()->getNodeId()); + } + CurDAG->ReplaceAllUsesWith(N, Shl); + AM.IndexReg = And; + AM.Scale = (1 << ScaleLog); + return false; + } + } + + // Handle "(X << C1) & C2" as "(X & (C2>>C1)) << C1" if safe and if this + // allows us to fold the shift into this addressing mode. + if (Shift.getOpcode() != ISD::SHL) break; + + // Not likely to be profitable if either the AND or SHIFT node has more + // than one use (unless all uses are for address computation). Besides, + // isel mechanism requires their node ids to be reused. + if (!N.hasOneUse() || !Shift.hasOneUse()) + break; + + // Verify that the shift amount is something we can fold. + unsigned ShiftCst = C1->getZExtValue(); + if (ShiftCst != 1 && ShiftCst != 2 && ShiftCst != 3) + break; + + // Get the new AND mask, this folds to a constant. + SDValue NewANDMask = CurDAG->getNode(ISD::SRL, dl, N.getValueType(), + SDValue(C2, 0), SDValue(C1, 0)); + SDValue NewAND = CurDAG->getNode(ISD::AND, dl, N.getValueType(), X, + NewANDMask); + SDValue NewSHIFT = CurDAG->getNode(ISD::SHL, dl, N.getValueType(), + NewAND, SDValue(C1, 0)); + + // Insert the new nodes into the topological ordering. + if (C1->getNodeId() > X.getNode()->getNodeId()) { + CurDAG->RepositionNode(X.getNode(), C1); + C1->setNodeId(X.getNode()->getNodeId()); + } + if (NewANDMask.getNode()->getNodeId() == -1 || + NewANDMask.getNode()->getNodeId() > X.getNode()->getNodeId()) { + CurDAG->RepositionNode(X.getNode(), NewANDMask.getNode()); + NewANDMask.getNode()->setNodeId(X.getNode()->getNodeId()); + } + if (NewAND.getNode()->getNodeId() == -1 || + NewAND.getNode()->getNodeId() > Shift.getNode()->getNodeId()) { + CurDAG->RepositionNode(Shift.getNode(), NewAND.getNode()); + NewAND.getNode()->setNodeId(Shift.getNode()->getNodeId()); + } + if (NewSHIFT.getNode()->getNodeId() == -1 || + NewSHIFT.getNode()->getNodeId() > N.getNode()->getNodeId()) { + CurDAG->RepositionNode(N.getNode(), NewSHIFT.getNode()); + NewSHIFT.getNode()->setNodeId(N.getNode()->getNodeId()); + } + + CurDAG->ReplaceAllUsesWith(N, NewSHIFT); + + AM.Scale = 1 << ShiftCst; + AM.IndexReg = NewAND; + return false; + } + } + + return MatchAddressBase(N, AM); +} + +/// MatchAddressBase - Helper for MatchAddress. 
Add the specified node to the +/// specified addressing mode without any further recursion. +bool X86DAGToDAGISel::MatchAddressBase(SDValue N, X86ISelAddressMode &AM) { + // Is the base register already occupied? + if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) { + // If so, check to see if the scale index register is set. + if (AM.IndexReg.getNode() == 0) { + AM.IndexReg = N; + AM.Scale = 1; + return false; + } + + // Otherwise, we cannot select it. + return true; + } + + // Default, generate it as a register. + AM.BaseType = X86ISelAddressMode::RegBase; + AM.Base_Reg = N; + return false; +} + +/// SelectAddr - returns true if it is able pattern match an addressing mode. +/// It returns the operands which make up the maximal addressing mode it can +/// match by reference. +/// +/// Parent is the parent node of the addr operand that is being matched. It +/// is always a load, store, atomic node, or null. It is only null when +/// checking memory operands for inline asm nodes. +bool X86DAGToDAGISel::SelectAddr(SDNode *Parent, SDValue N, SDValue &Base, + SDValue &Scale, SDValue &Index, + SDValue &Disp, SDValue &Segment) { + X86ISelAddressMode AM; + + if (Parent && + // This list of opcodes are all the nodes that have an "addr:$ptr" operand + // that are not a MemSDNode, and thus don't have proper addrspace info. + Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme + Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores + Parent->getOpcode() != X86ISD::TLSCALL) { // Fixme + unsigned AddrSpace = + cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace(); + // AddrSpace 256 -> GS, 257 -> FS. + if (AddrSpace == 256) + AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16); + if (AddrSpace == 257) + AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16); + } + + if (MatchAddress(N, AM)) + return false; + + EVT VT = N.getValueType(); + if (AM.BaseType == X86ISelAddressMode::RegBase) { + if (!AM.Base_Reg.getNode()) + AM.Base_Reg = CurDAG->getRegister(0, VT); + } + + if (!AM.IndexReg.getNode()) + AM.IndexReg = CurDAG->getRegister(0, VT); + + getAddressOperands(AM, Base, Scale, Index, Disp, Segment); + return true; +} + +/// SelectScalarSSELoad - Match a scalar SSE load. In particular, we want to +/// match a load whose top elements are either undef or zeros. The load flavor +/// is derived from the type of N, which is either v4f32 or v2f64. +/// +/// We also return: +/// PatternChainNode: this is the matched node that has a chain input and +/// output. +bool X86DAGToDAGISel::SelectScalarSSELoad(SDNode *Root, + SDValue N, SDValue &Base, + SDValue &Scale, SDValue &Index, + SDValue &Disp, SDValue &Segment, + SDValue &PatternNodeWithChain) { + if (N.getOpcode() == ISD::SCALAR_TO_VECTOR) { + PatternNodeWithChain = N.getOperand(0); + if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) && + PatternNodeWithChain.hasOneUse() && + IsProfitableToFold(N.getOperand(0), N.getNode(), Root) && + IsLegalToFold(N.getOperand(0), N.getNode(), Root, OptLevel)) { + LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain); + if (!SelectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, Segment)) + return false; + return true; + } + } + + // Also handle the case where we explicitly require zeros in the top + // elements. This is a vector shuffle from the zero vector. + if (N.getOpcode() == X86ISD::VZEXT_MOVL && N.getNode()->hasOneUse() && + // Check to see if the top elements are all zeros (or bitcast of zeros). 
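+      // i.e. match (vzext_movl (scalar_to_vector (load))): a scalar load
+      // whose upper vector elements are zeroed.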
+ N.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR && + N.getOperand(0).getNode()->hasOneUse() && + ISD::isNON_EXTLoad(N.getOperand(0).getOperand(0).getNode()) && + N.getOperand(0).getOperand(0).hasOneUse() && + IsProfitableToFold(N.getOperand(0), N.getNode(), Root) && + IsLegalToFold(N.getOperand(0), N.getNode(), Root, OptLevel)) { + // Okay, this is a zero extending load. Fold it. + LoadSDNode *LD = cast<LoadSDNode>(N.getOperand(0).getOperand(0)); + if (!SelectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, Segment)) + return false; + PatternNodeWithChain = SDValue(LD, 0); + return true; + } + return false; +} + + +/// SelectLEAAddr - it calls SelectAddr and determines if the maximal addressing +/// mode it matches can be cost effectively emitted as an LEA instruction. +bool X86DAGToDAGISel::SelectLEAAddr(SDValue N, + SDValue &Base, SDValue &Scale, + SDValue &Index, SDValue &Disp, + SDValue &Segment) { + X86ISelAddressMode AM; + + // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support + // segments. + SDValue Copy = AM.Segment; + SDValue T = CurDAG->getRegister(0, MVT::i32); + AM.Segment = T; + if (MatchAddress(N, AM)) + return false; + assert (T == AM.Segment); + AM.Segment = Copy; + + EVT VT = N.getValueType(); + unsigned Complexity = 0; + if (AM.BaseType == X86ISelAddressMode::RegBase) + if (AM.Base_Reg.getNode()) + Complexity = 1; + else + AM.Base_Reg = CurDAG->getRegister(0, VT); + else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase) + Complexity = 4; + + if (AM.IndexReg.getNode()) + Complexity++; + else + AM.IndexReg = CurDAG->getRegister(0, VT); + + // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with + // a simple shift. + if (AM.Scale > 1) + Complexity++; + + // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA + // to a LEA. This is determined with some expermentation but is by no means + // optimal (especially for code size consideration). LEA is nice because of + // its three-address nature. Tweak the cost function again when we can run + // convertToThreeAddress() at register allocation time. + if (AM.hasSymbolicDisplacement()) { + // For X86-64, we should always use lea to materialize RIP relative + // addresses. + if (Subtarget->is64Bit()) + Complexity = 4; + else + Complexity += 2; + } + + if (AM.Disp && (AM.Base_Reg.getNode() || AM.IndexReg.getNode())) + Complexity++; + + // If it isn't worth using an LEA, reject it. + if (Complexity <= 2) + return false; + + getAddressOperands(AM, Base, Scale, Index, Disp, Segment); + return true; +} + +/// SelectTLSADDRAddr - This is only run on TargetGlobalTLSAddress nodes. 
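+/// On 32-bit targets it forces EBX (the GOT pointer) in as the index
+/// register, matching the operand form expected by the TLS_addr pseudo.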
+bool X86DAGToDAGISel::SelectTLSADDRAddr(SDValue N, SDValue &Base, + SDValue &Scale, SDValue &Index, + SDValue &Disp, SDValue &Segment) { + assert(N.getOpcode() == ISD::TargetGlobalTLSAddress); + const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N); + + X86ISelAddressMode AM; + AM.GV = GA->getGlobal(); + AM.Disp += GA->getOffset(); + AM.Base_Reg = CurDAG->getRegister(0, N.getValueType()); + AM.SymbolFlags = GA->getTargetFlags(); + + if (N.getValueType() == MVT::i32) { + AM.Scale = 1; + AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32); + } else { + AM.IndexReg = CurDAG->getRegister(0, MVT::i64); + } + + getAddressOperands(AM, Base, Scale, Index, Disp, Segment); + return true; +} + + +bool X86DAGToDAGISel::TryFoldLoad(SDNode *P, SDValue N, + SDValue &Base, SDValue &Scale, + SDValue &Index, SDValue &Disp, + SDValue &Segment) { + if (!ISD::isNON_EXTLoad(N.getNode()) || + !IsProfitableToFold(N, P, P) || + !IsLegalToFold(N, P, P, OptLevel)) + return false; + + return SelectAddr(N.getNode(), + N.getOperand(1), Base, Scale, Index, Disp, Segment); +} + +/// getGlobalBaseReg - Return an SDNode that returns the value of +/// the global base register. Output instructions required to +/// initialize the global base register, if necessary. +/// +SDNode *X86DAGToDAGISel::getGlobalBaseReg() { + unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF); + return CurDAG->getRegister(GlobalBaseReg, TLI.getPointerTy()).getNode(); +} + +SDNode *X86DAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) { + SDValue Chain = Node->getOperand(0); + SDValue In1 = Node->getOperand(1); + SDValue In2L = Node->getOperand(2); + SDValue In2H = Node->getOperand(3); + SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; + if (!SelectAddr(Node, In1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) + return NULL; + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemSDNode>(Node)->getMemOperand(); + const SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, In2L, In2H, Chain}; + SDNode *ResNode = CurDAG->getMachineNode(Opc, Node->getDebugLoc(), + MVT::i32, MVT::i32, MVT::Other, Ops, + array_lengthof(Ops)); + cast<MachineSDNode>(ResNode)->setMemRefs(MemOp, MemOp + 1); + return ResNode; +} + +// FIXME: Figure out some way to unify this with the 'or' and other code +// below. +SDNode *X86DAGToDAGISel::SelectAtomicLoadAdd(SDNode *Node, EVT NVT) { + if (Node->hasAnyUseOfValue(0)) + return 0; + + // Optimize common patterns for __sync_add_and_fetch and + // __sync_sub_and_fetch where the result is not used. This allows us + // to use "lock" version of add, sub, inc, dec instructions. + // FIXME: Do not use special instructions but instead add the "lock" + // prefix to the target node somehow. The extra information will then be + // transferred to machine instruction and it denotes the prefix. 
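+  // For example, __sync_fetch_and_add(&x, 1) with an unused result can be
+  // emitted as a single "lock incl" on the memory operand instead of a
+  // lock xadd.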
+ SDValue Chain = Node->getOperand(0); + SDValue Ptr = Node->getOperand(1); + SDValue Val = Node->getOperand(2); + SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; + if (!SelectAddr(Node, Ptr, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) + return 0; + + bool isInc = false, isDec = false, isSub = false, isCN = false; + ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val); + if (CN && CN->getSExtValue() == (int32_t)CN->getSExtValue()) { + isCN = true; + int64_t CNVal = CN->getSExtValue(); + if (CNVal == 1) + isInc = true; + else if (CNVal == -1) + isDec = true; + else if (CNVal >= 0) + Val = CurDAG->getTargetConstant(CNVal, NVT); + else { + isSub = true; + Val = CurDAG->getTargetConstant(-CNVal, NVT); + } + } else if (Val.hasOneUse() && + Val.getOpcode() == ISD::SUB && + X86::isZeroNode(Val.getOperand(0))) { + isSub = true; + Val = Val.getOperand(1); + } + + DebugLoc dl = Node->getDebugLoc(); + unsigned Opc = 0; + switch (NVT.getSimpleVT().SimpleTy) { + default: return 0; + case MVT::i8: + if (isInc) + Opc = X86::LOCK_INC8m; + else if (isDec) + Opc = X86::LOCK_DEC8m; + else if (isSub) { + if (isCN) + Opc = X86::LOCK_SUB8mi; + else + Opc = X86::LOCK_SUB8mr; + } else { + if (isCN) + Opc = X86::LOCK_ADD8mi; + else + Opc = X86::LOCK_ADD8mr; + } + break; + case MVT::i16: + if (isInc) + Opc = X86::LOCK_INC16m; + else if (isDec) + Opc = X86::LOCK_DEC16m; + else if (isSub) { + if (isCN) { + if (immSext8(Val.getNode())) + Opc = X86::LOCK_SUB16mi8; + else + Opc = X86::LOCK_SUB16mi; + } else + Opc = X86::LOCK_SUB16mr; + } else { + if (isCN) { + if (immSext8(Val.getNode())) + Opc = X86::LOCK_ADD16mi8; + else + Opc = X86::LOCK_ADD16mi; + } else + Opc = X86::LOCK_ADD16mr; + } + break; + case MVT::i32: + if (isInc) + Opc = X86::LOCK_INC32m; + else if (isDec) + Opc = X86::LOCK_DEC32m; + else if (isSub) { + if (isCN) { + if (immSext8(Val.getNode())) + Opc = X86::LOCK_SUB32mi8; + else + Opc = X86::LOCK_SUB32mi; + } else + Opc = X86::LOCK_SUB32mr; + } else { + if (isCN) { + if (immSext8(Val.getNode())) + Opc = X86::LOCK_ADD32mi8; + else + Opc = X86::LOCK_ADD32mi; + } else + Opc = X86::LOCK_ADD32mr; + } + break; + case MVT::i64: + if (isInc) + Opc = X86::LOCK_INC64m; + else if (isDec) + Opc = X86::LOCK_DEC64m; + else if (isSub) { + Opc = X86::LOCK_SUB64mr; + if (isCN) { + if (immSext8(Val.getNode())) + Opc = X86::LOCK_SUB64mi8; + else if (i64immSExt32(Val.getNode())) + Opc = X86::LOCK_SUB64mi32; + } + } else { + Opc = X86::LOCK_ADD64mr; + if (isCN) { + if (immSext8(Val.getNode())) + Opc = X86::LOCK_ADD64mi8; + else if (i64immSExt32(Val.getNode())) + Opc = X86::LOCK_ADD64mi32; + } + } + break; + } + + SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, + dl, NVT), 0); + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemSDNode>(Node)->getMemOperand(); + if (isInc || isDec) { + SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain }; + SDValue Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops, 6), 0); + cast<MachineSDNode>(Ret)->setMemRefs(MemOp, MemOp + 1); + SDValue RetVals[] = { Undef, Ret }; + return CurDAG->getMergeValues(RetVals, 2, dl).getNode(); + } else { + SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Val, Chain }; + SDValue Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops, 7), 0); + cast<MachineSDNode>(Ret)->setMemRefs(MemOp, MemOp + 1); + SDValue RetVals[] = { Undef, Ret }; + return CurDAG->getMergeValues(RetVals, 2, dl).getNode(); + } +} + +enum AtomicOpc { + OR, + AND, + XOR, + AtomicOpcEnd +}; + +enum AtomicSz { + ConstantI8, + I8, + 
SextConstantI16, + ConstantI16, + I16, + SextConstantI32, + ConstantI32, + I32, + SextConstantI64, + ConstantI64, + I64, + AtomicSzEnd +}; + +static const unsigned int AtomicOpcTbl[AtomicOpcEnd][AtomicSzEnd] = { + { + X86::LOCK_OR8mi, + X86::LOCK_OR8mr, + X86::LOCK_OR16mi8, + X86::LOCK_OR16mi, + X86::LOCK_OR16mr, + X86::LOCK_OR32mi8, + X86::LOCK_OR32mi, + X86::LOCK_OR32mr, + X86::LOCK_OR64mi8, + X86::LOCK_OR64mi32, + X86::LOCK_OR64mr + }, + { + X86::LOCK_AND8mi, + X86::LOCK_AND8mr, + X86::LOCK_AND16mi8, + X86::LOCK_AND16mi, + X86::LOCK_AND16mr, + X86::LOCK_AND32mi8, + X86::LOCK_AND32mi, + X86::LOCK_AND32mr, + X86::LOCK_AND64mi8, + X86::LOCK_AND64mi32, + X86::LOCK_AND64mr + }, + { + X86::LOCK_XOR8mi, + X86::LOCK_XOR8mr, + X86::LOCK_XOR16mi8, + X86::LOCK_XOR16mi, + X86::LOCK_XOR16mr, + X86::LOCK_XOR32mi8, + X86::LOCK_XOR32mi, + X86::LOCK_XOR32mr, + X86::LOCK_XOR64mi8, + X86::LOCK_XOR64mi32, + X86::LOCK_XOR64mr + } +}; + +SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, EVT NVT) { + if (Node->hasAnyUseOfValue(0)) + return 0; + + // Optimize common patterns for __sync_or_and_fetch and similar arith + // operations where the result is not used. This allows us to use the "lock" + // version of the arithmetic instruction. + // FIXME: Same as for 'add' and 'sub', try to merge those down here. + SDValue Chain = Node->getOperand(0); + SDValue Ptr = Node->getOperand(1); + SDValue Val = Node->getOperand(2); + SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; + if (!SelectAddr(Node, Ptr, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) + return 0; + + // Which index into the table. + enum AtomicOpc Op; + switch (Node->getOpcode()) { + case ISD::ATOMIC_LOAD_OR: + Op = OR; + break; + case ISD::ATOMIC_LOAD_AND: + Op = AND; + break; + case ISD::ATOMIC_LOAD_XOR: + Op = XOR; + break; + default: + return 0; + } + + bool isCN = false; + ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val); + if (CN && (int32_t)CN->getSExtValue() == CN->getSExtValue()) { + isCN = true; + Val = CurDAG->getTargetConstant(CN->getSExtValue(), NVT); + } + + unsigned Opc = 0; + switch (NVT.getSimpleVT().SimpleTy) { + default: return 0; + case MVT::i8: + if (isCN) + Opc = AtomicOpcTbl[Op][ConstantI8]; + else + Opc = AtomicOpcTbl[Op][I8]; + break; + case MVT::i16: + if (isCN) { + if (immSext8(Val.getNode())) + Opc = AtomicOpcTbl[Op][SextConstantI16]; + else + Opc = AtomicOpcTbl[Op][ConstantI16]; + } else + Opc = AtomicOpcTbl[Op][I16]; + break; + case MVT::i32: + if (isCN) { + if (immSext8(Val.getNode())) + Opc = AtomicOpcTbl[Op][SextConstantI32]; + else + Opc = AtomicOpcTbl[Op][ConstantI32]; + } else + Opc = AtomicOpcTbl[Op][I32]; + break; + case MVT::i64: + Opc = AtomicOpcTbl[Op][I64]; + if (isCN) { + if (immSext8(Val.getNode())) + Opc = AtomicOpcTbl[Op][SextConstantI64]; + else if (i64immSExt32(Val.getNode())) + Opc = AtomicOpcTbl[Op][ConstantI64]; + } + break; + } + + assert(Opc != 0 && "Invalid arith lock transform!"); + + DebugLoc dl = Node->getDebugLoc(); + SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, + dl, NVT), 0); + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemSDNode>(Node)->getMemOperand(); + SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Val, Chain }; + SDValue Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops, 7), 0); + cast<MachineSDNode>(Ret)->setMemRefs(MemOp, MemOp + 1); + SDValue RetVals[] = { Undef, Ret }; + return CurDAG->getMergeValues(RetVals, 2, dl).getNode(); +} + +/// HasNoSignedComparisonUses - Test whether the given X86ISD::CMP node has +/// any 
uses which require the SF or OF bits to be accurate. +static bool HasNoSignedComparisonUses(SDNode *N) { + // Examine each user of the node. + for (SDNode::use_iterator UI = N->use_begin(), + UE = N->use_end(); UI != UE; ++UI) { + // Only examine CopyToReg uses. + if (UI->getOpcode() != ISD::CopyToReg) + return false; + // Only examine CopyToReg uses that copy to EFLAGS. + if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() != + X86::EFLAGS) + return false; + // Examine each user of the CopyToReg use. + for (SDNode::use_iterator FlagUI = UI->use_begin(), + FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) { + // Only examine the Flag result. + if (FlagUI.getUse().getResNo() != 1) continue; + // Anything unusual: assume conservatively. + if (!FlagUI->isMachineOpcode()) return false; + // Examine the opcode of the user. + switch (FlagUI->getMachineOpcode()) { + // These comparisons don't treat the most significant bit specially. + case X86::SETAr: case X86::SETAEr: case X86::SETBr: case X86::SETBEr: + case X86::SETEr: case X86::SETNEr: case X86::SETPr: case X86::SETNPr: + case X86::SETAm: case X86::SETAEm: case X86::SETBm: case X86::SETBEm: + case X86::SETEm: case X86::SETNEm: case X86::SETPm: case X86::SETNPm: + case X86::JA_4: case X86::JAE_4: case X86::JB_4: case X86::JBE_4: + case X86::JE_4: case X86::JNE_4: case X86::JP_4: case X86::JNP_4: + case X86::CMOVA16rr: case X86::CMOVA16rm: + case X86::CMOVA32rr: case X86::CMOVA32rm: + case X86::CMOVA64rr: case X86::CMOVA64rm: + case X86::CMOVAE16rr: case X86::CMOVAE16rm: + case X86::CMOVAE32rr: case X86::CMOVAE32rm: + case X86::CMOVAE64rr: case X86::CMOVAE64rm: + case X86::CMOVB16rr: case X86::CMOVB16rm: + case X86::CMOVB32rr: case X86::CMOVB32rm: + case X86::CMOVB64rr: case X86::CMOVB64rm: + case X86::CMOVBE16rr: case X86::CMOVBE16rm: + case X86::CMOVBE32rr: case X86::CMOVBE32rm: + case X86::CMOVBE64rr: case X86::CMOVBE64rm: + case X86::CMOVE16rr: case X86::CMOVE16rm: + case X86::CMOVE32rr: case X86::CMOVE32rm: + case X86::CMOVE64rr: case X86::CMOVE64rm: + case X86::CMOVNE16rr: case X86::CMOVNE16rm: + case X86::CMOVNE32rr: case X86::CMOVNE32rm: + case X86::CMOVNE64rr: case X86::CMOVNE64rm: + case X86::CMOVNP16rr: case X86::CMOVNP16rm: + case X86::CMOVNP32rr: case X86::CMOVNP32rm: + case X86::CMOVNP64rr: case X86::CMOVNP64rm: + case X86::CMOVP16rr: case X86::CMOVP16rm: + case X86::CMOVP32rr: case X86::CMOVP32rm: + case X86::CMOVP64rr: case X86::CMOVP64rm: + continue; + // Anything else: assume conservatively. + default: return false; + } + } + } + return true; +} + +SDNode *X86DAGToDAGISel::Select(SDNode *Node) { + EVT NVT = Node->getValueType(0); + unsigned Opc, MOpc; + unsigned Opcode = Node->getOpcode(); + DebugLoc dl = Node->getDebugLoc(); + + DEBUG(dbgs() << "Selecting: "; Node->dump(CurDAG); dbgs() << '\n'); + + if (Node->isMachineOpcode()) { + DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n'); + return NULL; // Already selected. 
+ } + + switch (Opcode) { + default: break; + case X86ISD::GlobalBaseReg: + return getGlobalBaseReg(); + + case X86ISD::ATOMOR64_DAG: + return SelectAtomic64(Node, X86::ATOMOR6432); + case X86ISD::ATOMXOR64_DAG: + return SelectAtomic64(Node, X86::ATOMXOR6432); + case X86ISD::ATOMADD64_DAG: + return SelectAtomic64(Node, X86::ATOMADD6432); + case X86ISD::ATOMSUB64_DAG: + return SelectAtomic64(Node, X86::ATOMSUB6432); + case X86ISD::ATOMNAND64_DAG: + return SelectAtomic64(Node, X86::ATOMNAND6432); + case X86ISD::ATOMAND64_DAG: + return SelectAtomic64(Node, X86::ATOMAND6432); + case X86ISD::ATOMSWAP64_DAG: + return SelectAtomic64(Node, X86::ATOMSWAP6432); + + case ISD::ATOMIC_LOAD_ADD: { + SDNode *RetVal = SelectAtomicLoadAdd(Node, NVT); + if (RetVal) + return RetVal; + break; + } + case ISD::ATOMIC_LOAD_XOR: + case ISD::ATOMIC_LOAD_AND: + case ISD::ATOMIC_LOAD_OR: { + SDNode *RetVal = SelectAtomicLoadArith(Node, NVT); + if (RetVal) + return RetVal; + break; + } + case ISD::AND: + case ISD::OR: + case ISD::XOR: { + // For operations of the form (x << C1) op C2, check if we can use a smaller + // encoding for C2 by transforming it into (x op (C2>>C1)) << C1. + SDValue N0 = Node->getOperand(0); + SDValue N1 = Node->getOperand(1); + + if (N0->getOpcode() != ISD::SHL || !N0->hasOneUse()) + break; + + // i8 is unshrinkable, i16 should be promoted to i32. + if (NVT != MVT::i32 && NVT != MVT::i64) + break; + + ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1); + ConstantSDNode *ShlCst = dyn_cast<ConstantSDNode>(N0->getOperand(1)); + if (!Cst || !ShlCst) + break; + + int64_t Val = Cst->getSExtValue(); + uint64_t ShlVal = ShlCst->getZExtValue(); + + // Make sure that we don't change the operation by removing bits. + // This only matters for OR and XOR, AND is unaffected. + if (Opcode != ISD::AND && ((Val >> ShlVal) << ShlVal) != Val) + break; + + unsigned ShlOp, Op = 0; + EVT CstVT = NVT; + + // Check the minimum bitwidth for the new constant. + // TODO: AND32ri is the same as AND64ri32 with zext imm. + // TODO: MOV32ri+OR64r is cheaper than MOV64ri64+OR64rr + // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32. + if (!isInt<8>(Val) && isInt<8>(Val >> ShlVal)) + CstVT = MVT::i8; + else if (!isInt<32>(Val) && isInt<32>(Val >> ShlVal)) + CstVT = MVT::i32; + + // Bail if there is no smaller encoding. + if (NVT == CstVT) + break; + + switch (NVT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Unsupported VT!"); + case MVT::i32: + assert(CstVT == MVT::i8); + ShlOp = X86::SHL32ri; + + switch (Opcode) { + case ISD::AND: Op = X86::AND32ri8; break; + case ISD::OR: Op = X86::OR32ri8; break; + case ISD::XOR: Op = X86::XOR32ri8; break; + } + break; + case MVT::i64: + assert(CstVT == MVT::i8 || CstVT == MVT::i32); + ShlOp = X86::SHL64ri; + + switch (Opcode) { + case ISD::AND: Op = CstVT==MVT::i8? X86::AND64ri8 : X86::AND64ri32; break; + case ISD::OR: Op = CstVT==MVT::i8? X86::OR64ri8 : X86::OR64ri32; break; + case ISD::XOR: Op = CstVT==MVT::i8? X86::XOR64ri8 : X86::XOR64ri32; break; + } + break; + } + + // Emit the smaller op and the shift. 
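+    // e.g. "(x << 8) & 0xf00" becomes an AND with the 8-bit immediate 0xf
+    // followed by the shift, saving three bytes of immediate encoding.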
+ SDValue NewCst = CurDAG->getTargetConstant(Val >> ShlVal, CstVT); + SDNode *New = CurDAG->getMachineNode(Op, dl, NVT, N0->getOperand(0),NewCst); + return CurDAG->SelectNodeTo(Node, ShlOp, NVT, SDValue(New, 0), + getI8Imm(ShlVal)); + break; + } + case X86ISD::UMUL: { + SDValue N0 = Node->getOperand(0); + SDValue N1 = Node->getOperand(1); + + unsigned LoReg; + switch (NVT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Unsupported VT!"); + case MVT::i8: LoReg = X86::AL; Opc = X86::MUL8r; break; + case MVT::i16: LoReg = X86::AX; Opc = X86::MUL16r; break; + case MVT::i32: LoReg = X86::EAX; Opc = X86::MUL32r; break; + case MVT::i64: LoReg = X86::RAX; Opc = X86::MUL64r; break; + } + + SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg, + N0, SDValue()).getValue(1); + + SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::i32); + SDValue Ops[] = {N1, InFlag}; + SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops, 2); + + ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0)); + ReplaceUses(SDValue(Node, 1), SDValue(CNode, 1)); + ReplaceUses(SDValue(Node, 2), SDValue(CNode, 2)); + return NULL; + } + + case ISD::SMUL_LOHI: + case ISD::UMUL_LOHI: { + SDValue N0 = Node->getOperand(0); + SDValue N1 = Node->getOperand(1); + + bool isSigned = Opcode == ISD::SMUL_LOHI; + if (!isSigned) { + switch (NVT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Unsupported VT!"); + case MVT::i8: Opc = X86::MUL8r; MOpc = X86::MUL8m; break; + case MVT::i16: Opc = X86::MUL16r; MOpc = X86::MUL16m; break; + case MVT::i32: Opc = X86::MUL32r; MOpc = X86::MUL32m; break; + case MVT::i64: Opc = X86::MUL64r; MOpc = X86::MUL64m; break; + } + } else { + switch (NVT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Unsupported VT!"); + case MVT::i8: Opc = X86::IMUL8r; MOpc = X86::IMUL8m; break; + case MVT::i16: Opc = X86::IMUL16r; MOpc = X86::IMUL16m; break; + case MVT::i32: Opc = X86::IMUL32r; MOpc = X86::IMUL32m; break; + case MVT::i64: Opc = X86::IMUL64r; MOpc = X86::IMUL64m; break; + } + } + + unsigned LoReg, HiReg; + switch (NVT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Unsupported VT!"); + case MVT::i8: LoReg = X86::AL; HiReg = X86::AH; break; + case MVT::i16: LoReg = X86::AX; HiReg = X86::DX; break; + case MVT::i32: LoReg = X86::EAX; HiReg = X86::EDX; break; + case MVT::i64: LoReg = X86::RAX; HiReg = X86::RDX; break; + } + + SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; + bool foldedLoad = TryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); + // Multiply is commmutative. + if (!foldedLoad) { + foldedLoad = TryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); + if (foldedLoad) + std::swap(N0, N1); + } + + SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg, + N0, SDValue()).getValue(1); + + if (foldedLoad) { + SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0), + InFlag }; + SDNode *CNode = + CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops, + array_lengthof(Ops)); + InFlag = SDValue(CNode, 1); + + // Update the chain. + ReplaceUses(N1.getValue(1), SDValue(CNode, 0)); + } else { + SDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Glue, N1, InFlag); + InFlag = SDValue(CNode, 0); + } + + // Prevent use of AH in a REX instruction by referencing AX instead. + if (HiReg == X86::AH && Subtarget->is64Bit() && + !SDValue(Node, 1).use_empty()) { + SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, + X86::AX, MVT::i16, InFlag); + InFlag = Result.getValue(2); + // Get the low part if needed. 
Don't use getCopyFromReg for aliasing + // registers. + if (!SDValue(Node, 0).use_empty()) + ReplaceUses(SDValue(Node, 1), + CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result)); + + // Shift AX down 8 bits. + Result = SDValue(CurDAG->getMachineNode(X86::SHR16ri, dl, MVT::i16, + Result, + CurDAG->getTargetConstant(8, MVT::i8)), 0); + // Then truncate it down to i8. + ReplaceUses(SDValue(Node, 1), + CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result)); + } + // Copy the low half of the result, if it is needed. + if (!SDValue(Node, 0).use_empty()) { + SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, + LoReg, NVT, InFlag); + InFlag = Result.getValue(2); + ReplaceUses(SDValue(Node, 0), Result); + DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n'); + } + // Copy the high half of the result, if it is needed. + if (!SDValue(Node, 1).use_empty()) { + SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, + HiReg, NVT, InFlag); + InFlag = Result.getValue(2); + ReplaceUses(SDValue(Node, 1), Result); + DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n'); + } + + return NULL; + } + + case ISD::SDIVREM: + case ISD::UDIVREM: { + SDValue N0 = Node->getOperand(0); + SDValue N1 = Node->getOperand(1); + + bool isSigned = Opcode == ISD::SDIVREM; + if (!isSigned) { + switch (NVT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Unsupported VT!"); + case MVT::i8: Opc = X86::DIV8r; MOpc = X86::DIV8m; break; + case MVT::i16: Opc = X86::DIV16r; MOpc = X86::DIV16m; break; + case MVT::i32: Opc = X86::DIV32r; MOpc = X86::DIV32m; break; + case MVT::i64: Opc = X86::DIV64r; MOpc = X86::DIV64m; break; + } + } else { + switch (NVT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Unsupported VT!"); + case MVT::i8: Opc = X86::IDIV8r; MOpc = X86::IDIV8m; break; + case MVT::i16: Opc = X86::IDIV16r; MOpc = X86::IDIV16m; break; + case MVT::i32: Opc = X86::IDIV32r; MOpc = X86::IDIV32m; break; + case MVT::i64: Opc = X86::IDIV64r; MOpc = X86::IDIV64m; break; + } + } + + unsigned LoReg, HiReg, ClrReg; + unsigned ClrOpcode, SExtOpcode; + switch (NVT.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Unsupported VT!"); + case MVT::i8: + LoReg = X86::AL; ClrReg = HiReg = X86::AH; + ClrOpcode = 0; + SExtOpcode = X86::CBW; + break; + case MVT::i16: + LoReg = X86::AX; HiReg = X86::DX; + ClrOpcode = X86::MOV16r0; ClrReg = X86::DX; + SExtOpcode = X86::CWD; + break; + case MVT::i32: + LoReg = X86::EAX; ClrReg = HiReg = X86::EDX; + ClrOpcode = X86::MOV32r0; + SExtOpcode = X86::CDQ; + break; + case MVT::i64: + LoReg = X86::RAX; ClrReg = HiReg = X86::RDX; + ClrOpcode = X86::MOV64r0; + SExtOpcode = X86::CQO; + break; + } + + SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; + bool foldedLoad = TryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); + bool signBitIsZero = CurDAG->SignBitIsZero(N0); + + SDValue InFlag; + if (NVT == MVT::i8 && (!isSigned || signBitIsZero)) { + // Special case for div8, just use a move with zero extension to AX to + // clear the upper 8 bits (AH). 
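+      // i.e. emit "movzbl %src, %eax" so the 16-bit dividend in AX already
+      // has AH == 0 before the 8-bit divide.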
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Move, Chain; + if (TryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { + SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) }; + Move = + SDValue(CurDAG->getMachineNode(X86::MOVZX32rm8, dl, MVT::i32, + MVT::Other, Ops, + array_lengthof(Ops)), 0); + Chain = Move.getValue(1); + ReplaceUses(N0.getValue(1), Chain); + } else { + Move = + SDValue(CurDAG->getMachineNode(X86::MOVZX32rr8, dl, MVT::i32, N0),0); + Chain = CurDAG->getEntryNode(); + } + Chain = CurDAG->getCopyToReg(Chain, dl, X86::EAX, Move, SDValue()); + InFlag = Chain.getValue(1); + } else { + InFlag = + CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, + LoReg, N0, SDValue()).getValue(1); + if (isSigned && !signBitIsZero) { + // Sign extend the low part into the high part. + InFlag = + SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0); + } else { + // Zero out the high part, effectively zero extending the input. + SDValue ClrNode = + SDValue(CurDAG->getMachineNode(ClrOpcode, dl, NVT), 0); + InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg, + ClrNode, InFlag).getValue(1); + } + } + + if (foldedLoad) { + SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0), + InFlag }; + SDNode *CNode = + CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops, + array_lengthof(Ops)); + InFlag = SDValue(CNode, 1); + // Update the chain. + ReplaceUses(N1.getValue(1), SDValue(CNode, 0)); + } else { + InFlag = + SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N1, InFlag), 0); + } + + // Prevent use of AH in a REX instruction by referencing AX instead. + // Shift it down 8 bits. + if (HiReg == X86::AH && Subtarget->is64Bit() && + !SDValue(Node, 1).use_empty()) { + SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, + X86::AX, MVT::i16, InFlag); + InFlag = Result.getValue(2); + + // If we also need AL (the quotient), get it by extracting a subreg from + // Result. The fast register allocator does not like multiple CopyFromReg + // nodes using aliasing registers. + if (!SDValue(Node, 0).use_empty()) + ReplaceUses(SDValue(Node, 0), + CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result)); + + // Shift AX right by 8 bits instead of using AH. + Result = SDValue(CurDAG->getMachineNode(X86::SHR16ri, dl, MVT::i16, + Result, + CurDAG->getTargetConstant(8, MVT::i8)), + 0); + ReplaceUses(SDValue(Node, 1), + CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result)); + } + // Copy the division (low) result, if it is needed. + if (!SDValue(Node, 0).use_empty()) { + SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, + LoReg, NVT, InFlag); + InFlag = Result.getValue(2); + ReplaceUses(SDValue(Node, 0), Result); + DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n'); + } + // Copy the remainder (high) result, if it is needed. + if (!SDValue(Node, 1).use_empty()) { + SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, + HiReg, NVT, InFlag); + InFlag = Result.getValue(2); + ReplaceUses(SDValue(Node, 1), Result); + DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n'); + } + return NULL; + } + + case X86ISD::CMP: { + SDValue N0 = Node->getOperand(0); + SDValue N1 = Node->getOperand(1); + + // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to + // use a smaller encoding. + if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() && + HasNoSignedComparisonUses(Node)) + // Look past the truncate if CMP is the only use of it. 
+ N0 = N0.getOperand(0); + if (N0.getNode()->getOpcode() == ISD::AND && N0.getNode()->hasOneUse() && + N0.getValueType() != MVT::i8 && + X86::isZeroNode(N1)) { + ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getNode()->getOperand(1)); + if (!C) break; + + // For example, convert "testl %eax, $8" to "testb %al, $8" + if ((C->getZExtValue() & ~UINT64_C(0xff)) == 0 && + (!(C->getZExtValue() & 0x80) || + HasNoSignedComparisonUses(Node))) { + SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i8); + SDValue Reg = N0.getNode()->getOperand(0); + + // On x86-32, only the ABCD registers have 8-bit subregisters. + if (!Subtarget->is64Bit()) { + TargetRegisterClass *TRC = 0; + switch (N0.getValueType().getSimpleVT().SimpleTy) { + case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break; + case MVT::i16: TRC = &X86::GR16_ABCDRegClass; break; + default: llvm_unreachable("Unsupported TEST operand type!"); + } + SDValue RC = CurDAG->getTargetConstant(TRC->getID(), MVT::i32); + Reg = SDValue(CurDAG->getMachineNode(X86::COPY_TO_REGCLASS, dl, + Reg.getValueType(), Reg, RC), 0); + } + + // Extract the l-register. + SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, + MVT::i8, Reg); + + // Emit a testb. + return CurDAG->getMachineNode(X86::TEST8ri, dl, MVT::i32, Subreg, Imm); + } + + // For example, "testl %eax, $2048" to "testb %ah, $8". + if ((C->getZExtValue() & ~UINT64_C(0xff00)) == 0 && + (!(C->getZExtValue() & 0x8000) || + HasNoSignedComparisonUses(Node))) { + // Shift the immediate right by 8 bits. + SDValue ShiftedImm = CurDAG->getTargetConstant(C->getZExtValue() >> 8, + MVT::i8); + SDValue Reg = N0.getNode()->getOperand(0); + + // Put the value in an ABCD register. + TargetRegisterClass *TRC = 0; + switch (N0.getValueType().getSimpleVT().SimpleTy) { + case MVT::i64: TRC = &X86::GR64_ABCDRegClass; break; + case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break; + case MVT::i16: TRC = &X86::GR16_ABCDRegClass; break; + default: llvm_unreachable("Unsupported TEST operand type!"); + } + SDValue RC = CurDAG->getTargetConstant(TRC->getID(), MVT::i32); + Reg = SDValue(CurDAG->getMachineNode(X86::COPY_TO_REGCLASS, dl, + Reg.getValueType(), Reg, RC), 0); + + // Extract the h-register. + SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl, + MVT::i8, Reg); + + // Emit a testb. The EXTRACT_SUBREG becomes a COPY that can only + // target GR8_NOREX registers, so make sure the register class is + // forced. + return CurDAG->getMachineNode(X86::TEST8ri_NOREX, dl, MVT::i32, + Subreg, ShiftedImm); + } + + // For example, "testl %eax, $32776" to "testw %ax, $32776". + if ((C->getZExtValue() & ~UINT64_C(0xffff)) == 0 && + N0.getValueType() != MVT::i16 && + (!(C->getZExtValue() & 0x8000) || + HasNoSignedComparisonUses(Node))) { + SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i16); + SDValue Reg = N0.getNode()->getOperand(0); + + // Extract the 16-bit subregister. + SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_16bit, dl, + MVT::i16, Reg); + + // Emit a testw. + return CurDAG->getMachineNode(X86::TEST16ri, dl, MVT::i32, Subreg, Imm); + } + + // For example, "testq %rax, $268468232" to "testl %eax, $268468232". + if ((C->getZExtValue() & ~UINT64_C(0xffffffff)) == 0 && + N0.getValueType() == MVT::i64 && + (!(C->getZExtValue() & 0x80000000) || + HasNoSignedComparisonUses(Node))) { + SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i32); + SDValue Reg = N0.getNode()->getOperand(0); + + // Extract the 32-bit subregister. 
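// Illustrative note, not part of the imported diff: the four narrowing cases
// above and below apply the same classification to the AND immediate, on top
// of checking that the operand type is actually wider than the narrowed TEST.
// Restated as a standalone sketch (classifyTestImm is hypothetical, not an
// LLVM API):
//   enum class NarrowTest { None, Byte, HighByte, Word, DWord };
//   NarrowTest classifyTestImm(uint64_t Imm, bool HasNoSignedUses) {
//     if ((Imm & ~UINT64_C(0xff)) == 0 &&
//         (!(Imm & 0x80) || HasNoSignedUses))
//       return NarrowTest::Byte;      // testb %al, imm8
//     if ((Imm & ~UINT64_C(0xff00)) == 0 &&
//         (!(Imm & 0x8000) || HasNoSignedUses))
//       return NarrowTest::HighByte;  // testb %ah, imm8 (uses Imm >> 8)
//     if ((Imm & ~UINT64_C(0xffff)) == 0 &&
//         (!(Imm & 0x8000) || HasNoSignedUses))
//       return NarrowTest::Word;      // testw %ax, imm16
//     if ((Imm & ~UINT64_C(0xffffffff)) == 0 &&
//         (!(Imm & 0x80000000) || HasNoSignedUses))
//       return NarrowTest::DWord;     // testl %eax, imm32
//     return NarrowTest::None;
//   }
// The i64 -> testl case concludes below by extracting the 32-bit subregister.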
+ SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_32bit, dl, + MVT::i32, Reg); + + // Emit a testl. + return CurDAG->getMachineNode(X86::TEST32ri, dl, MVT::i32, Subreg, Imm); + } + } + break; + } + } + + SDNode *ResNode = SelectCode(Node); + + DEBUG(dbgs() << "=> "; + if (ResNode == NULL || ResNode == Node) + Node->dump(CurDAG); + else + ResNode->dump(CurDAG); + dbgs() << '\n'); + + return ResNode; +} + +bool X86DAGToDAGISel:: +SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode, + std::vector<SDValue> &OutOps) { + SDValue Op0, Op1, Op2, Op3, Op4; + switch (ConstraintCode) { + case 'o': // offsetable ?? + case 'v': // not offsetable ?? + default: return true; + case 'm': // memory + if (!SelectAddr(0, Op, Op0, Op1, Op2, Op3, Op4)) + return true; + break; + } + + OutOps.push_back(Op0); + OutOps.push_back(Op1); + OutOps.push_back(Op2); + OutOps.push_back(Op3); + OutOps.push_back(Op4); + return false; +} + +/// createX86ISelDag - This pass converts a legalized DAG into a +/// X86-specific DAG, ready for instruction scheduling. +/// +FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM, + llvm::CodeGenOpt::Level OptLevel) { + return new X86DAGToDAGISel(TM, OptLevel); +} diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp new file mode 100644 index 0000000..7c8ce17 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -0,0 +1,14965 @@ +//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the interfaces that X86 uses to lower LLVM code into a +// selection DAG. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "x86-isel" +#include "X86.h" +#include "X86InstrBuilder.h" +#include "X86ISelLowering.h" +#include "X86TargetMachine.h" +#include "X86TargetObjectFile.h" +#include "Utils/X86ShuffleDecode.h" +#include "llvm/CallingConv.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/GlobalAlias.h" +#include "llvm/GlobalVariable.h" +#include "llvm/Function.h" +#include "llvm/Instructions.h" +#include "llvm/Intrinsics.h" +#include "llvm/LLVMContext.h" +#include "llvm/CodeGen/IntrinsicLowering.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/VectorExtras.h" +#include "llvm/Support/CallSite.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Dwarf.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetOptions.h" +using namespace llvm; +using namespace dwarf; + +STATISTIC(NumTailCalls, "Number of tail calls"); + +// Forward declarations. 
+static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, + SDValue V2); + +static SDValue Insert128BitVector(SDValue Result, + SDValue Vec, + SDValue Idx, + SelectionDAG &DAG, + DebugLoc dl); + +static SDValue Extract128BitVector(SDValue Vec, + SDValue Idx, + SelectionDAG &DAG, + DebugLoc dl); + +/// Generate a DAG to grab 128-bits from a vector > 128 bits. This +/// sets things up to match to an AVX VEXTRACTF128 instruction or a +/// simple subregister reference. Idx is an index in the 128 bits we +/// want. It need not be aligned to a 128-bit bounday. That makes +/// lowering EXTRACT_VECTOR_ELT operations easier. +static SDValue Extract128BitVector(SDValue Vec, + SDValue Idx, + SelectionDAG &DAG, + DebugLoc dl) { + EVT VT = Vec.getValueType(); + assert(VT.getSizeInBits() == 256 && "Unexpected vector size!"); + EVT ElVT = VT.getVectorElementType(); + int Factor = VT.getSizeInBits()/128; + EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, + VT.getVectorNumElements()/Factor); + + // Extract from UNDEF is UNDEF. + if (Vec.getOpcode() == ISD::UNDEF) + return DAG.getNode(ISD::UNDEF, dl, ResultVT); + + if (isa<ConstantSDNode>(Idx)) { + unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + + // Extract the relevant 128 bits. Generate an EXTRACT_SUBVECTOR + // we can match to VEXTRACTF128. + unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits(); + + // This is the index of the first element of the 128-bit chunk + // we want. + unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128) + * ElemsPerChunk); + + SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32); + SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, + VecIdx); + + return Result; + } + + return SDValue(); +} + +/// Generate a DAG to put 128-bits into a vector > 128 bits. This +/// sets things up to match to an AVX VINSERTF128 instruction or a +/// simple superregister reference. Idx is an index in the 128 bits +/// we want. It need not be aligned to a 128-bit bounday. That makes +/// lowering INSERT_VECTOR_ELT operations easier. +static SDValue Insert128BitVector(SDValue Result, + SDValue Vec, + SDValue Idx, + SelectionDAG &DAG, + DebugLoc dl) { + if (isa<ConstantSDNode>(Idx)) { + EVT VT = Vec.getValueType(); + assert(VT.getSizeInBits() == 128 && "Unexpected vector size!"); + + EVT ElVT = VT.getVectorElementType(); + unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + EVT ResultVT = Result.getValueType(); + + // Insert the relevant 128 bits. + unsigned ElemsPerChunk = 128/ElVT.getSizeInBits(); + + // This is the index of the first element of the 128-bit chunk + // we want. 
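// Worked example, not part of the imported diff: for a v8i32 source and a
// constant element index IdxVal = 5, ElemsPerChunk = 128/32 = 4 and
// NormalizedIdxVal = ((5 * 32) / 128) * 4 = 4, i.e. the extract/insert is
// rewritten against the 128-bit chunk that holds elements 4..7.  The same
// arithmetic as a standalone sketch (normalizeChunkIdx is hypothetical):
//   constexpr unsigned normalizeChunkIdx(unsigned IdxVal, unsigned EltBits) {
//     unsigned ElemsPerChunk = 128 / EltBits;            // elements per chunk
//     return ((IdxVal * EltBits) / 128) * ElemsPerChunk; // first element of chunk
//   }
//   // normalizeChunkIdx(5, 32) == 4, normalizeChunkIdx(3, 32) == 0
// The code below computes exactly this value before forming the subvector node.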
+ unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128) + * ElemsPerChunk); + + SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32); + Result = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, + VecIdx); + return Result; + } + + return SDValue(); +} + +static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) { + const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>(); + bool is64Bit = Subtarget->is64Bit(); + + if (Subtarget->isTargetEnvMacho()) { + if (is64Bit) + return new X8664_MachoTargetObjectFile(); + return new TargetLoweringObjectFileMachO(); + } + + if (Subtarget->isTargetELF()) + return new TargetLoweringObjectFileELF(); + if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho()) + return new TargetLoweringObjectFileCOFF(); + llvm_unreachable("unknown subtarget type"); +} + +X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) + : TargetLowering(TM, createTLOF(TM)) { + Subtarget = &TM.getSubtarget<X86Subtarget>(); + X86ScalarSSEf64 = Subtarget->hasXMMInt(); + X86ScalarSSEf32 = Subtarget->hasXMM(); + X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP; + + RegInfo = TM.getRegisterInfo(); + TD = getTargetData(); + + // Set up the TargetLowering object. + static MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }; + + // X86 is weird, it always uses i8 for shift amounts and setcc results. + setBooleanContents(ZeroOrOneBooleanContent); + // X86-SSE is even stranger. It uses -1 or 0 for vector masks. + setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); + + // For 64-bit since we have so many registers use the ILP scheduler, for + // 32-bit code use the register pressure specific scheduling. + if (Subtarget->is64Bit()) + setSchedulingPreference(Sched::ILP); + else + setSchedulingPreference(Sched::RegPressure); + setStackPointerRegisterToSaveRestore(X86StackPtr); + + if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) { + // Setup Windows compiler runtime calls. + setLibcallName(RTLIB::SDIV_I64, "_alldiv"); + setLibcallName(RTLIB::UDIV_I64, "_aulldiv"); + setLibcallName(RTLIB::SREM_I64, "_allrem"); + setLibcallName(RTLIB::UREM_I64, "_aullrem"); + setLibcallName(RTLIB::MUL_I64, "_allmul"); + setLibcallName(RTLIB::FPTOUINT_F64_I64, "_ftol2"); + setLibcallName(RTLIB::FPTOUINT_F32_I64, "_ftol2"); + setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall); + setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall); + setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall); + setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall); + setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall); + setLibcallCallingConv(RTLIB::FPTOUINT_F64_I64, CallingConv::C); + setLibcallCallingConv(RTLIB::FPTOUINT_F32_I64, CallingConv::C); + } + + if (Subtarget->isTargetDarwin()) { + // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp. + setUseUnderscoreSetJmp(false); + setUseUnderscoreLongJmp(false); + } else if (Subtarget->isTargetMingw()) { + // MS runtime is weird: it exports _setjmp, but longjmp! + setUseUnderscoreSetJmp(true); + setUseUnderscoreLongJmp(false); + } else { + setUseUnderscoreSetJmp(true); + setUseUnderscoreLongJmp(true); + } + + // Set up the register classes. 
+ addRegisterClass(MVT::i8, X86::GR8RegisterClass); + addRegisterClass(MVT::i16, X86::GR16RegisterClass); + addRegisterClass(MVT::i32, X86::GR32RegisterClass); + if (Subtarget->is64Bit()) + addRegisterClass(MVT::i64, X86::GR64RegisterClass); + + setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); + + // We don't accept any truncstore of integer registers. + setTruncStoreAction(MVT::i64, MVT::i32, Expand); + setTruncStoreAction(MVT::i64, MVT::i16, Expand); + setTruncStoreAction(MVT::i64, MVT::i8 , Expand); + setTruncStoreAction(MVT::i32, MVT::i16, Expand); + setTruncStoreAction(MVT::i32, MVT::i8 , Expand); + setTruncStoreAction(MVT::i16, MVT::i8, Expand); + + // SETOEQ and SETUNE require checking two conditions. + setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand); + setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand); + setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand); + setCondCodeAction(ISD::SETUNE, MVT::f32, Expand); + setCondCodeAction(ISD::SETUNE, MVT::f64, Expand); + setCondCodeAction(ISD::SETUNE, MVT::f80, Expand); + + // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this + // operation. + setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote); + setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote); + setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote); + + if (Subtarget->is64Bit()) { + setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote); + setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Expand); + } else if (!UseSoftFloat) { + // We have an algorithm for SSE2->double, and we turn this into a + // 64-bit FILD followed by conditional FADD for other targets. + setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); + // We have an algorithm for SSE2, and we turn this into a 64-bit + // FILD for other targets. + setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); + } + + // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have + // this operation. + setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote); + setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote); + + if (!UseSoftFloat) { + // SSE has no i16 to fp conversion, only i32 + if (X86ScalarSSEf32) { + setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); + // f32 and f64 cases are Legal, f80 case is not + setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); + } else { + setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom); + setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); + } + } else { + setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); + setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote); + } + + // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 + // are Legal, f80 is custom lowered. + setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom); + setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom); + + // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have + // this operation. + setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote); + setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote); + + if (X86ScalarSSEf32) { + setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); + // f32 and f64 cases are Legal, f80 case is not + setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); + } else { + setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom); + setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); + } + + // Handle FP_TO_UINT by promoting the destination to a larger signed + // conversion. 
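// Illustrative note, not part of the imported diff: promoting FP_TO_UINT to a
// wider signed conversion works because every value in range for the narrow
// unsigned type also fits the wider signed type.  For example, a
// double -> uint16_t conversion can be carried out as double -> int32_t
// followed by truncation:
//   uint16_t f2u16(double D) { return (uint16_t)(int32_t)D; }
// The promotions below rely on exactly this property.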
+ setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote); + setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote); + setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote); + + if (Subtarget->is64Bit()) { + setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand); + setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote); + } else if (!UseSoftFloat) { + // Since AVX is a superset of SSE3, only check for SSE here. + if (Subtarget->hasSSE1() && !Subtarget->hasSSE3()) + // Expand FP_TO_UINT into a select. + // FIXME: We would like to use a Custom expander here eventually to do + // the optimal thing for SSE vs. the default expansion in the legalizer. + setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand); + else + // With SSE3 we can use fisttpll to convert to a signed i64; without + // SSE, we're stuck with a fistpll. + setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); + } + + // TODO: when we have SSE, these could be more efficient, by using movd/movq. + if (!X86ScalarSSEf64) { + setOperationAction(ISD::BITCAST , MVT::f32 , Expand); + setOperationAction(ISD::BITCAST , MVT::i32 , Expand); + if (Subtarget->is64Bit()) { + setOperationAction(ISD::BITCAST , MVT::f64 , Expand); + // Without SSE, i64->f64 goes through memory. + setOperationAction(ISD::BITCAST , MVT::i64 , Expand); + } + } + + // Scalar integer divide and remainder are lowered to use operations that + // produce two results, to match the available instructions. This exposes + // the two-result form to trivial CSE, which is able to combine x/y and x%y + // into a single instruction. + // + // Scalar integer multiply-high is also lowered to use two-result + // operations, to match the available instructions. However, plain multiply + // (low) operations are left as Legal, as there are single-result + // instructions for this in x86. Using the two-result multiply instructions + // when both high and low results are needed must be arranged by dagcombine. + for (unsigned i = 0, e = 4; i != e; ++i) { + MVT VT = IntVTs[i]; + setOperationAction(ISD::MULHS, VT, Expand); + setOperationAction(ISD::MULHU, VT, Expand); + setOperationAction(ISD::SDIV, VT, Expand); + setOperationAction(ISD::UDIV, VT, Expand); + setOperationAction(ISD::SREM, VT, Expand); + setOperationAction(ISD::UREM, VT, Expand); + + // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences. 
+ setOperationAction(ISD::ADDC, VT, Custom); + setOperationAction(ISD::ADDE, VT, Custom); + setOperationAction(ISD::SUBC, VT, Custom); + setOperationAction(ISD::SUBE, VT, Custom); + } + + setOperationAction(ISD::BR_JT , MVT::Other, Expand); + setOperationAction(ISD::BRCOND , MVT::Other, Custom); + setOperationAction(ISD::BR_CC , MVT::Other, Expand); + setOperationAction(ISD::SELECT_CC , MVT::Other, Expand); + if (Subtarget->is64Bit()) + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); + setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand); + setOperationAction(ISD::FREM , MVT::f32 , Expand); + setOperationAction(ISD::FREM , MVT::f64 , Expand); + setOperationAction(ISD::FREM , MVT::f80 , Expand); + setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); + + if (Subtarget->hasBMI()) { + setOperationAction(ISD::CTTZ , MVT::i8 , Promote); + } else { + setOperationAction(ISD::CTTZ , MVT::i8 , Custom); + setOperationAction(ISD::CTTZ , MVT::i16 , Custom); + setOperationAction(ISD::CTTZ , MVT::i32 , Custom); + if (Subtarget->is64Bit()) + setOperationAction(ISD::CTTZ , MVT::i64 , Custom); + } + + if (Subtarget->hasLZCNT()) { + setOperationAction(ISD::CTLZ , MVT::i8 , Promote); + } else { + setOperationAction(ISD::CTLZ , MVT::i8 , Custom); + setOperationAction(ISD::CTLZ , MVT::i16 , Custom); + setOperationAction(ISD::CTLZ , MVT::i32 , Custom); + if (Subtarget->is64Bit()) + setOperationAction(ISD::CTLZ , MVT::i64 , Custom); + } + + if (Subtarget->hasPOPCNT()) { + setOperationAction(ISD::CTPOP , MVT::i8 , Promote); + } else { + setOperationAction(ISD::CTPOP , MVT::i8 , Expand); + setOperationAction(ISD::CTPOP , MVT::i16 , Expand); + setOperationAction(ISD::CTPOP , MVT::i32 , Expand); + if (Subtarget->is64Bit()) + setOperationAction(ISD::CTPOP , MVT::i64 , Expand); + } + + setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom); + setOperationAction(ISD::BSWAP , MVT::i16 , Expand); + + // These should be promoted to a larger select which is supported. + setOperationAction(ISD::SELECT , MVT::i1 , Promote); + // X86 wants to expand cmov itself. + setOperationAction(ISD::SELECT , MVT::i8 , Custom); + setOperationAction(ISD::SELECT , MVT::i16 , Custom); + setOperationAction(ISD::SELECT , MVT::i32 , Custom); + setOperationAction(ISD::SELECT , MVT::f32 , Custom); + setOperationAction(ISD::SELECT , MVT::f64 , Custom); + setOperationAction(ISD::SELECT , MVT::f80 , Custom); + setOperationAction(ISD::SETCC , MVT::i8 , Custom); + setOperationAction(ISD::SETCC , MVT::i16 , Custom); + setOperationAction(ISD::SETCC , MVT::i32 , Custom); + setOperationAction(ISD::SETCC , MVT::f32 , Custom); + setOperationAction(ISD::SETCC , MVT::f64 , Custom); + setOperationAction(ISD::SETCC , MVT::f80 , Custom); + if (Subtarget->is64Bit()) { + setOperationAction(ISD::SELECT , MVT::i64 , Custom); + setOperationAction(ISD::SETCC , MVT::i64 , Custom); + } + setOperationAction(ISD::EH_RETURN , MVT::Other, Custom); + + // Darwin ABI issue. 
+ setOperationAction(ISD::ConstantPool , MVT::i32 , Custom); + setOperationAction(ISD::JumpTable , MVT::i32 , Custom); + setOperationAction(ISD::GlobalAddress , MVT::i32 , Custom); + setOperationAction(ISD::GlobalTLSAddress, MVT::i32 , Custom); + if (Subtarget->is64Bit()) + setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); + setOperationAction(ISD::ExternalSymbol , MVT::i32 , Custom); + setOperationAction(ISD::BlockAddress , MVT::i32 , Custom); + if (Subtarget->is64Bit()) { + setOperationAction(ISD::ConstantPool , MVT::i64 , Custom); + setOperationAction(ISD::JumpTable , MVT::i64 , Custom); + setOperationAction(ISD::GlobalAddress , MVT::i64 , Custom); + setOperationAction(ISD::ExternalSymbol, MVT::i64 , Custom); + setOperationAction(ISD::BlockAddress , MVT::i64 , Custom); + } + // 64-bit addm sub, shl, sra, srl (iff 32-bit x86) + setOperationAction(ISD::SHL_PARTS , MVT::i32 , Custom); + setOperationAction(ISD::SRA_PARTS , MVT::i32 , Custom); + setOperationAction(ISD::SRL_PARTS , MVT::i32 , Custom); + if (Subtarget->is64Bit()) { + setOperationAction(ISD::SHL_PARTS , MVT::i64 , Custom); + setOperationAction(ISD::SRA_PARTS , MVT::i64 , Custom); + setOperationAction(ISD::SRL_PARTS , MVT::i64 , Custom); + } + + if (Subtarget->hasXMM()) + setOperationAction(ISD::PREFETCH , MVT::Other, Legal); + + setOperationAction(ISD::MEMBARRIER , MVT::Other, Custom); + setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom); + + // On X86 and X86-64, atomic operations are lowered to locked instructions. + // Locked instructions, in turn, have implicit fence semantics (all memory + // operations are flushed before issuing the locked instruction, and they + // are not buffered), so we can fold away the common pattern of + // fence-atomic-fence. + setShouldFoldAtomicFences(true); + + // Expand certain atomics + for (unsigned i = 0, e = 4; i != e; ++i) { + MVT VT = IntVTs[i]; + setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom); + setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom); + setOperationAction(ISD::ATOMIC_STORE, VT, Custom); + } + + if (!Subtarget->is64Bit()) { + setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom); + setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom); + setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom); + setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom); + setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom); + setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom); + setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom); + setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom); + } + + if (Subtarget->hasCmpxchg16b()) { + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom); + } + + // FIXME - use subtarget debug flags + if (!Subtarget->isTargetDarwin() && + !Subtarget->isTargetELF() && + !Subtarget->isTargetCygMing()) { + setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); + } + + setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand); + setOperationAction(ISD::EHSELECTION, MVT::i64, Expand); + setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand); + setOperationAction(ISD::EHSELECTION, MVT::i32, Expand); + if (Subtarget->is64Bit()) { + setExceptionPointerRegister(X86::RAX); + setExceptionSelectorRegister(X86::RDX); + } else { + setExceptionPointerRegister(X86::EAX); + setExceptionSelectorRegister(X86::EDX); + } + setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom); + setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom); + + setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom); + 
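// Illustrative note, not part of the imported diff: setShouldFoldAtomicFences
// above is what lets a fence / locked-RMW / fence sequence collapse to just
// the locked instruction.  For example (illustration only):
//   #include <atomic>
//   int bump(std::atomic<int> &Counter) {
//     std::atomic_thread_fence(std::memory_order_seq_cst);
//     int Old = Counter.fetch_add(1);
//     std::atomic_thread_fence(std::memory_order_seq_cst);
//     return Old;
//   }
// On x86 the LOCK XADD that implements fetch_add already orders the
// surrounding memory operations, so both adjacent fences are redundant and
// can be folded away instead of being emitted as MFENCE.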
setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom); + + setOperationAction(ISD::TRAP, MVT::Other, Legal); + + // VASTART needs to be custom lowered to use the VarArgsFrameIndex + setOperationAction(ISD::VASTART , MVT::Other, Custom); + setOperationAction(ISD::VAEND , MVT::Other, Expand); + if (Subtarget->is64Bit()) { + setOperationAction(ISD::VAARG , MVT::Other, Custom); + setOperationAction(ISD::VACOPY , MVT::Other, Custom); + } else { + setOperationAction(ISD::VAARG , MVT::Other, Expand); + setOperationAction(ISD::VACOPY , MVT::Other, Expand); + } + + setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); + setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); + + if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho()) + setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ? + MVT::i64 : MVT::i32, Custom); + else if (EnableSegmentedStacks) + setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ? + MVT::i64 : MVT::i32, Custom); + else + setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ? + MVT::i64 : MVT::i32, Expand); + + if (!UseSoftFloat && X86ScalarSSEf64) { + // f32 and f64 use SSE. + // Set up the FP register classes. + addRegisterClass(MVT::f32, X86::FR32RegisterClass); + addRegisterClass(MVT::f64, X86::FR64RegisterClass); + + // Use ANDPD to simulate FABS. + setOperationAction(ISD::FABS , MVT::f64, Custom); + setOperationAction(ISD::FABS , MVT::f32, Custom); + + // Use XORP to simulate FNEG. + setOperationAction(ISD::FNEG , MVT::f64, Custom); + setOperationAction(ISD::FNEG , MVT::f32, Custom); + + // Use ANDPD and ORPD to simulate FCOPYSIGN. + setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); + setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); + + // Lower this to FGETSIGNx86 plus an AND. + setOperationAction(ISD::FGETSIGN, MVT::i64, Custom); + setOperationAction(ISD::FGETSIGN, MVT::i32, Custom); + + // We don't support sin/cos/fmod + setOperationAction(ISD::FSIN , MVT::f64, Expand); + setOperationAction(ISD::FCOS , MVT::f64, Expand); + setOperationAction(ISD::FSIN , MVT::f32, Expand); + setOperationAction(ISD::FCOS , MVT::f32, Expand); + + // Expand FP immediates into loads from the stack, except for the special + // cases we handle. + addLegalFPImmediate(APFloat(+0.0)); // xorpd + addLegalFPImmediate(APFloat(+0.0f)); // xorps + } else if (!UseSoftFloat && X86ScalarSSEf32) { + // Use SSE for f32, x87 for f64. + // Set up the FP register classes. + addRegisterClass(MVT::f32, X86::FR32RegisterClass); + addRegisterClass(MVT::f64, X86::RFP64RegisterClass); + + // Use ANDPS to simulate FABS. + setOperationAction(ISD::FABS , MVT::f32, Custom); + + // Use XORP to simulate FNEG. + setOperationAction(ISD::FNEG , MVT::f32, Custom); + + setOperationAction(ISD::UNDEF, MVT::f64, Expand); + + // Use ANDPS and ORPS to simulate FCOPYSIGN. + setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); + + // We don't support sin/cos/fmod + setOperationAction(ISD::FSIN , MVT::f32, Expand); + setOperationAction(ISD::FCOS , MVT::f32, Expand); + + // Special cases we handle for FP constants. 
+ addLegalFPImmediate(APFloat(+0.0f)); // xorps + addLegalFPImmediate(APFloat(+0.0)); // FLD0 + addLegalFPImmediate(APFloat(+1.0)); // FLD1 + addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS + addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS + + if (!UnsafeFPMath) { + setOperationAction(ISD::FSIN , MVT::f64 , Expand); + setOperationAction(ISD::FCOS , MVT::f64 , Expand); + } + } else if (!UseSoftFloat) { + // f32 and f64 in x87. + // Set up the FP register classes. + addRegisterClass(MVT::f64, X86::RFP64RegisterClass); + addRegisterClass(MVT::f32, X86::RFP32RegisterClass); + + setOperationAction(ISD::UNDEF, MVT::f64, Expand); + setOperationAction(ISD::UNDEF, MVT::f32, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); + + if (!UnsafeFPMath) { + setOperationAction(ISD::FSIN , MVT::f64 , Expand); + setOperationAction(ISD::FCOS , MVT::f64 , Expand); + } + addLegalFPImmediate(APFloat(+0.0)); // FLD0 + addLegalFPImmediate(APFloat(+1.0)); // FLD1 + addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS + addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS + addLegalFPImmediate(APFloat(+0.0f)); // FLD0 + addLegalFPImmediate(APFloat(+1.0f)); // FLD1 + addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS + addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS + } + + // We don't support FMA. + setOperationAction(ISD::FMA, MVT::f64, Expand); + setOperationAction(ISD::FMA, MVT::f32, Expand); + + // Long double always uses X87. + if (!UseSoftFloat) { + addRegisterClass(MVT::f80, X86::RFP80RegisterClass); + setOperationAction(ISD::UNDEF, MVT::f80, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); + { + APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended); + addLegalFPImmediate(TmpFlt); // FLD0 + TmpFlt.changeSign(); + addLegalFPImmediate(TmpFlt); // FLD0/FCHS + + bool ignored; + APFloat TmpFlt2(+1.0); + TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven, + &ignored); + addLegalFPImmediate(TmpFlt2); // FLD1 + TmpFlt2.changeSign(); + addLegalFPImmediate(TmpFlt2); // FLD1/FCHS + } + + if (!UnsafeFPMath) { + setOperationAction(ISD::FSIN , MVT::f80 , Expand); + setOperationAction(ISD::FCOS , MVT::f80 , Expand); + } + + setOperationAction(ISD::FMA, MVT::f80, Expand); + } + + // Always use a library call for pow. + setOperationAction(ISD::FPOW , MVT::f32 , Expand); + setOperationAction(ISD::FPOW , MVT::f64 , Expand); + setOperationAction(ISD::FPOW , MVT::f80 , Expand); + + setOperationAction(ISD::FLOG, MVT::f80, Expand); + setOperationAction(ISD::FLOG2, MVT::f80, Expand); + setOperationAction(ISD::FLOG10, MVT::f80, Expand); + setOperationAction(ISD::FEXP, MVT::f80, Expand); + setOperationAction(ISD::FEXP2, MVT::f80, Expand); + + // First set operation action for all vector types to either promote + // (for widening) or expand (for scalarization). Then we will selectively + // turn on ones that can be effectively codegen'd. 
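// Illustrative note, not part of the imported diff: "Expand" here is the
// conservative baseline.  For instance, ISD::SDIV on MVT::v4i32 stays Expand
// and is scalarized into four i32 divisions, while operations that a later
// block marks Legal or Custom (e.g. ISD::ADD on v4i32 under SSE2, further
// down) map directly onto vector instructions.  The loop below installs this
// baseline for every vector value type.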
+ for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; + VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) { + setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::MUL , (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::EXTRACT_VECTOR_ELT,(MVT::SimpleValueType)VT,Expand); + setOperationAction(ISD::INSERT_VECTOR_ELT,(MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::EXTRACT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand); + setOperationAction(ISD::INSERT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand); + setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::SETCC, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, 
(MVT::SimpleValueType)VT,Expand); + setOperationAction(ISD::TRUNCATE, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::SIGN_EXTEND, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::ZERO_EXTEND, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::ANY_EXTEND, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::VSELECT, (MVT::SimpleValueType)VT, Expand); + for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; + InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT) + setTruncStoreAction((MVT::SimpleValueType)VT, + (MVT::SimpleValueType)InnerVT, Expand); + setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand); + setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand); + setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand); + } + + // FIXME: In order to prevent SSE instructions being expanded to MMX ones + // with -msoft-float, disable use of MMX as well. + if (!UseSoftFloat && Subtarget->hasMMX()) { + addRegisterClass(MVT::x86mmx, X86::VR64RegisterClass); + // No operations on x86mmx supported, everything uses intrinsics. + } + + // MMX-sized vectors (other than x86mmx) are expected to be expanded + // into smaller operations. + setOperationAction(ISD::MULHS, MVT::v8i8, Expand); + setOperationAction(ISD::MULHS, MVT::v4i16, Expand); + setOperationAction(ISD::MULHS, MVT::v2i32, Expand); + setOperationAction(ISD::MULHS, MVT::v1i64, Expand); + setOperationAction(ISD::AND, MVT::v8i8, Expand); + setOperationAction(ISD::AND, MVT::v4i16, Expand); + setOperationAction(ISD::AND, MVT::v2i32, Expand); + setOperationAction(ISD::AND, MVT::v1i64, Expand); + setOperationAction(ISD::OR, MVT::v8i8, Expand); + setOperationAction(ISD::OR, MVT::v4i16, Expand); + setOperationAction(ISD::OR, MVT::v2i32, Expand); + setOperationAction(ISD::OR, MVT::v1i64, Expand); + setOperationAction(ISD::XOR, MVT::v8i8, Expand); + setOperationAction(ISD::XOR, MVT::v4i16, Expand); + setOperationAction(ISD::XOR, MVT::v2i32, Expand); + setOperationAction(ISD::XOR, MVT::v1i64, Expand); + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Expand); + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Expand); + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i32, Expand); + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Expand); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v1i64, Expand); + setOperationAction(ISD::SELECT, MVT::v8i8, Expand); + setOperationAction(ISD::SELECT, MVT::v4i16, Expand); + setOperationAction(ISD::SELECT, MVT::v2i32, Expand); + setOperationAction(ISD::SELECT, MVT::v1i64, Expand); + setOperationAction(ISD::BITCAST, MVT::v8i8, Expand); + setOperationAction(ISD::BITCAST, MVT::v4i16, Expand); + setOperationAction(ISD::BITCAST, MVT::v2i32, Expand); + setOperationAction(ISD::BITCAST, MVT::v1i64, Expand); + + if (!UseSoftFloat && Subtarget->hasXMM()) { + addRegisterClass(MVT::v4f32, X86::VR128RegisterClass); + + setOperationAction(ISD::FADD, MVT::v4f32, Legal); + setOperationAction(ISD::FSUB, MVT::v4f32, Legal); + setOperationAction(ISD::FMUL, MVT::v4f32, Legal); + setOperationAction(ISD::FDIV, MVT::v4f32, Legal); + setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); + setOperationAction(ISD::FNEG, MVT::v4f32, Custom); + setOperationAction(ISD::LOAD, MVT::v4f32, Legal); + setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); + setOperationAction(ISD::SELECT, MVT::v4f32, Custom); + 
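// Illustrative note, not part of the imported diff: FNEG on v4f32 is Custom
// rather than Legal because SSE has no vector negate instruction; it is
// lowered by XOR'ing with a sign-bit mask, the same trick the scalar comments
// earlier in this constructor describe ("Use XORP to simulate FNEG").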
setOperationAction(ISD::SETCC, MVT::v4f32, Custom); + } + + if (!UseSoftFloat && Subtarget->hasXMMInt()) { + addRegisterClass(MVT::v2f64, X86::VR128RegisterClass); + + // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM + // registers cannot be used even for integer operations. + addRegisterClass(MVT::v16i8, X86::VR128RegisterClass); + addRegisterClass(MVT::v8i16, X86::VR128RegisterClass); + addRegisterClass(MVT::v4i32, X86::VR128RegisterClass); + addRegisterClass(MVT::v2i64, X86::VR128RegisterClass); + + setOperationAction(ISD::ADD, MVT::v16i8, Legal); + setOperationAction(ISD::ADD, MVT::v8i16, Legal); + setOperationAction(ISD::ADD, MVT::v4i32, Legal); + setOperationAction(ISD::ADD, MVT::v2i64, Legal); + setOperationAction(ISD::MUL, MVT::v2i64, Custom); + setOperationAction(ISD::SUB, MVT::v16i8, Legal); + setOperationAction(ISD::SUB, MVT::v8i16, Legal); + setOperationAction(ISD::SUB, MVT::v4i32, Legal); + setOperationAction(ISD::SUB, MVT::v2i64, Legal); + setOperationAction(ISD::MUL, MVT::v8i16, Legal); + setOperationAction(ISD::FADD, MVT::v2f64, Legal); + setOperationAction(ISD::FSUB, MVT::v2f64, Legal); + setOperationAction(ISD::FMUL, MVT::v2f64, Legal); + setOperationAction(ISD::FDIV, MVT::v2f64, Legal); + setOperationAction(ISD::FSQRT, MVT::v2f64, Legal); + setOperationAction(ISD::FNEG, MVT::v2f64, Custom); + + setOperationAction(ISD::SETCC, MVT::v2i64, Custom); + setOperationAction(ISD::SETCC, MVT::v16i8, Custom); + setOperationAction(ISD::SETCC, MVT::v8i16, Custom); + setOperationAction(ISD::SETCC, MVT::v4i32, Custom); + + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom); + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); + + setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom); + + // Custom lower build_vector, vector_shuffle, and extract_vector_elt. + for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) { + EVT VT = (MVT::SimpleValueType)i; + // Do not attempt to custom lower non-power-of-2 vectors + if (!isPowerOf2_32(VT.getVectorNumElements())) + continue; + // Do not attempt to custom lower non-128-bit vectors + if (!VT.is128BitVector()) + continue; + setOperationAction(ISD::BUILD_VECTOR, + VT.getSimpleVT().SimpleTy, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, + VT.getSimpleVT().SimpleTy, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, + VT.getSimpleVT().SimpleTy, Custom); + } + + setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom); + + if (Subtarget->is64Bit()) { + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom); + } + + // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64. 
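// Illustrative note, not part of the imported diff: this promotion is purely
// a retyping.  Bitwise AND/OR/XOR, full-width loads and whole-vector selects
// do not care how the 128 bits are carved up, so e.g. an AND of two v4i32
// values is bitcast to v2i64, performed there as a single PAND, and bitcast
// back.  The loop below records v2i64 as the promoted type for each of these
// operations.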
+ for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) { + MVT::SimpleValueType SVT = (MVT::SimpleValueType)i; + EVT VT = SVT; + + // Do not attempt to promote non-128-bit vectors + if (!VT.is128BitVector()) + continue; + + setOperationAction(ISD::AND, SVT, Promote); + AddPromotedToType (ISD::AND, SVT, MVT::v2i64); + setOperationAction(ISD::OR, SVT, Promote); + AddPromotedToType (ISD::OR, SVT, MVT::v2i64); + setOperationAction(ISD::XOR, SVT, Promote); + AddPromotedToType (ISD::XOR, SVT, MVT::v2i64); + setOperationAction(ISD::LOAD, SVT, Promote); + AddPromotedToType (ISD::LOAD, SVT, MVT::v2i64); + setOperationAction(ISD::SELECT, SVT, Promote); + AddPromotedToType (ISD::SELECT, SVT, MVT::v2i64); + } + + setTruncStoreAction(MVT::f64, MVT::f32, Expand); + + // Custom lower v2i64 and v2f64 selects. + setOperationAction(ISD::LOAD, MVT::v2f64, Legal); + setOperationAction(ISD::LOAD, MVT::v2i64, Legal); + setOperationAction(ISD::SELECT, MVT::v2f64, Custom); + setOperationAction(ISD::SELECT, MVT::v2i64, Custom); + + setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); + } + + if (Subtarget->hasSSE41() || Subtarget->hasAVX()) { + setOperationAction(ISD::FFLOOR, MVT::f32, Legal); + setOperationAction(ISD::FCEIL, MVT::f32, Legal); + setOperationAction(ISD::FTRUNC, MVT::f32, Legal); + setOperationAction(ISD::FRINT, MVT::f32, Legal); + setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal); + setOperationAction(ISD::FFLOOR, MVT::f64, Legal); + setOperationAction(ISD::FCEIL, MVT::f64, Legal); + setOperationAction(ISD::FTRUNC, MVT::f64, Legal); + setOperationAction(ISD::FRINT, MVT::f64, Legal); + setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); + + // FIXME: Do we need to handle scalar-to-vector here? + setOperationAction(ISD::MUL, MVT::v4i32, Legal); + + // Can turn SHL into an integer multiply. + setOperationAction(ISD::SHL, MVT::v4i32, Custom); + setOperationAction(ISD::SHL, MVT::v16i8, Custom); + + setOperationAction(ISD::VSELECT, MVT::v2f64, Legal); + setOperationAction(ISD::VSELECT, MVT::v2i64, Legal); + setOperationAction(ISD::VSELECT, MVT::v16i8, Legal); + setOperationAction(ISD::VSELECT, MVT::v4i32, Legal); + setOperationAction(ISD::VSELECT, MVT::v4f32, Legal); + + // i8 and i16 vectors are custom , because the source register and source + // source memory operand types are not the same width. f32 vectors are + // custom since the immediate controlling the insert encodes additional + // information. 
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); + + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); + + if (Subtarget->is64Bit()) { + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal); + } + } + + if (Subtarget->hasXMMInt()) { + setOperationAction(ISD::SRL, MVT::v2i64, Custom); + setOperationAction(ISD::SRL, MVT::v4i32, Custom); + setOperationAction(ISD::SRL, MVT::v16i8, Custom); + setOperationAction(ISD::SRL, MVT::v8i16, Custom); + + setOperationAction(ISD::SHL, MVT::v2i64, Custom); + setOperationAction(ISD::SHL, MVT::v4i32, Custom); + setOperationAction(ISD::SHL, MVT::v8i16, Custom); + + setOperationAction(ISD::SRA, MVT::v4i32, Custom); + setOperationAction(ISD::SRA, MVT::v8i16, Custom); + } + + if (Subtarget->hasSSE42() || Subtarget->hasAVX()) + setOperationAction(ISD::SETCC, MVT::v2i64, Custom); + + if (!UseSoftFloat && Subtarget->hasAVX()) { + addRegisterClass(MVT::v32i8, X86::VR256RegisterClass); + addRegisterClass(MVT::v16i16, X86::VR256RegisterClass); + addRegisterClass(MVT::v8i32, X86::VR256RegisterClass); + addRegisterClass(MVT::v8f32, X86::VR256RegisterClass); + addRegisterClass(MVT::v4i64, X86::VR256RegisterClass); + addRegisterClass(MVT::v4f64, X86::VR256RegisterClass); + + setOperationAction(ISD::LOAD, MVT::v8f32, Legal); + setOperationAction(ISD::LOAD, MVT::v4f64, Legal); + setOperationAction(ISD::LOAD, MVT::v4i64, Legal); + + setOperationAction(ISD::FADD, MVT::v8f32, Legal); + setOperationAction(ISD::FSUB, MVT::v8f32, Legal); + setOperationAction(ISD::FMUL, MVT::v8f32, Legal); + setOperationAction(ISD::FDIV, MVT::v8f32, Legal); + setOperationAction(ISD::FSQRT, MVT::v8f32, Legal); + setOperationAction(ISD::FNEG, MVT::v8f32, Custom); + + setOperationAction(ISD::FADD, MVT::v4f64, Legal); + setOperationAction(ISD::FSUB, MVT::v4f64, Legal); + setOperationAction(ISD::FMUL, MVT::v4f64, Legal); + setOperationAction(ISD::FDIV, MVT::v4f64, Legal); + setOperationAction(ISD::FSQRT, MVT::v4f64, Legal); + setOperationAction(ISD::FNEG, MVT::v4f64, Custom); + + setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); + setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal); + + setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f64, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i64, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i8, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i16, Custom); + + setOperationAction(ISD::SRL, MVT::v4i64, Custom); + setOperationAction(ISD::SRL, MVT::v8i32, Custom); + setOperationAction(ISD::SRL, MVT::v16i16, Custom); + setOperationAction(ISD::SRL, MVT::v32i8, Custom); + + setOperationAction(ISD::SHL, MVT::v4i64, Custom); + setOperationAction(ISD::SHL, MVT::v8i32, Custom); + setOperationAction(ISD::SHL, MVT::v16i16, Custom); + setOperationAction(ISD::SHL, MVT::v32i8, Custom); + + setOperationAction(ISD::SRA, MVT::v8i32, Custom); + 
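// Illustrative note, not part of the imported diff: the vector shifts marked
// Custom above and below are Custom because SSE/AVX at this level only shift
// every lane by one shared amount (PSLLW/PSRLD/PSRAD and friends), so
// per-element shift amounts, byte-element vectors, and the 256-bit integer
// types all need synthesized sequences from the custom lowering.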
setOperationAction(ISD::SRA, MVT::v16i16, Custom); + + setOperationAction(ISD::SETCC, MVT::v32i8, Custom); + setOperationAction(ISD::SETCC, MVT::v16i16, Custom); + setOperationAction(ISD::SETCC, MVT::v8i32, Custom); + setOperationAction(ISD::SETCC, MVT::v4i64, Custom); + + setOperationAction(ISD::SELECT, MVT::v4f64, Custom); + setOperationAction(ISD::SELECT, MVT::v4i64, Custom); + setOperationAction(ISD::SELECT, MVT::v8f32, Custom); + + setOperationAction(ISD::VSELECT, MVT::v4f64, Legal); + setOperationAction(ISD::VSELECT, MVT::v4i64, Legal); + setOperationAction(ISD::VSELECT, MVT::v8i32, Legal); + setOperationAction(ISD::VSELECT, MVT::v8f32, Legal); + + setOperationAction(ISD::ADD, MVT::v4i64, Custom); + setOperationAction(ISD::ADD, MVT::v8i32, Custom); + setOperationAction(ISD::ADD, MVT::v16i16, Custom); + setOperationAction(ISD::ADD, MVT::v32i8, Custom); + + setOperationAction(ISD::SUB, MVT::v4i64, Custom); + setOperationAction(ISD::SUB, MVT::v8i32, Custom); + setOperationAction(ISD::SUB, MVT::v16i16, Custom); + setOperationAction(ISD::SUB, MVT::v32i8, Custom); + + setOperationAction(ISD::MUL, MVT::v4i64, Custom); + setOperationAction(ISD::MUL, MVT::v8i32, Custom); + setOperationAction(ISD::MUL, MVT::v16i16, Custom); + // Don't lower v32i8 because there is no 128-bit byte mul + + // Custom lower several nodes for 256-bit types. + for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; + i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) { + MVT::SimpleValueType SVT = (MVT::SimpleValueType)i; + EVT VT = SVT; + + // Extract subvector is special because the value type + // (result) is 128-bit but the source is 256-bit wide. + if (VT.is128BitVector()) + setOperationAction(ISD::EXTRACT_SUBVECTOR, SVT, Custom); + + // Do not attempt to custom lower other non-256-bit vectors + if (!VT.is256BitVector()) + continue; + + setOperationAction(ISD::BUILD_VECTOR, SVT, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, SVT, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, SVT, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, SVT, Custom); + setOperationAction(ISD::SCALAR_TO_VECTOR, SVT, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, SVT, Custom); + } + + // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64. + for (unsigned i = (unsigned)MVT::v32i8; i != (unsigned)MVT::v4i64; ++i) { + MVT::SimpleValueType SVT = (MVT::SimpleValueType)i; + EVT VT = SVT; + + // Do not attempt to promote non-256-bit vectors + if (!VT.is256BitVector()) + continue; + + setOperationAction(ISD::AND, SVT, Promote); + AddPromotedToType (ISD::AND, SVT, MVT::v4i64); + setOperationAction(ISD::OR, SVT, Promote); + AddPromotedToType (ISD::OR, SVT, MVT::v4i64); + setOperationAction(ISD::XOR, SVT, Promote); + AddPromotedToType (ISD::XOR, SVT, MVT::v4i64); + setOperationAction(ISD::LOAD, SVT, Promote); + AddPromotedToType (ISD::LOAD, SVT, MVT::v4i64); + setOperationAction(ISD::SELECT, SVT, Promote); + AddPromotedToType (ISD::SELECT, SVT, MVT::v4i64); + } + } + + // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion + // of this type with custom code. + for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; + VT != (unsigned)MVT::LAST_VECTOR_VALUETYPE; VT++) { + setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT, Custom); + } + + // We want to custom lower some of our intrinsics. 
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + + + // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't + // handle type legalization for these operations here. + // + // FIXME: We really should do custom legalization for addition and + // subtraction on x86-32 once PR3203 is fixed. We really can't do much better + // than generic legalization for 64-bit multiplication-with-overflow, though. + for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) { + // Add/Sub/Mul with overflow operations are custom lowered. + MVT VT = IntVTs[i]; + setOperationAction(ISD::SADDO, VT, Custom); + setOperationAction(ISD::UADDO, VT, Custom); + setOperationAction(ISD::SSUBO, VT, Custom); + setOperationAction(ISD::USUBO, VT, Custom); + setOperationAction(ISD::SMULO, VT, Custom); + setOperationAction(ISD::UMULO, VT, Custom); + } + + // There are no 8-bit 3-address imul/mul instructions + setOperationAction(ISD::SMULO, MVT::i8, Expand); + setOperationAction(ISD::UMULO, MVT::i8, Expand); + + if (!Subtarget->is64Bit()) { + // These libcalls are not available in 32-bit. + setLibcallName(RTLIB::SHL_I128, 0); + setLibcallName(RTLIB::SRL_I128, 0); + setLibcallName(RTLIB::SRA_I128, 0); + } + + // We have target-specific dag combine patterns for the following nodes: + setTargetDAGCombine(ISD::VECTOR_SHUFFLE); + setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); + setTargetDAGCombine(ISD::BUILD_VECTOR); + setTargetDAGCombine(ISD::VSELECT); + setTargetDAGCombine(ISD::SELECT); + setTargetDAGCombine(ISD::SHL); + setTargetDAGCombine(ISD::SRA); + setTargetDAGCombine(ISD::SRL); + setTargetDAGCombine(ISD::OR); + setTargetDAGCombine(ISD::AND); + setTargetDAGCombine(ISD::ADD); + setTargetDAGCombine(ISD::FADD); + setTargetDAGCombine(ISD::FSUB); + setTargetDAGCombine(ISD::SUB); + setTargetDAGCombine(ISD::LOAD); + setTargetDAGCombine(ISD::STORE); + setTargetDAGCombine(ISD::ZERO_EXTEND); + setTargetDAGCombine(ISD::SINT_TO_FP); + if (Subtarget->is64Bit()) + setTargetDAGCombine(ISD::MUL); + + computeRegisterProperties(); + + // On Darwin, -Os means optimize for size without hurting performance, + // do not reduce the limit. + maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores + maxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8; + maxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores + maxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4; + maxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores + maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4; + setPrefLoopAlignment(16); + benefitFromCodePlacementOpt = true; + + setPrefFunctionAlignment(4); +} + + +EVT X86TargetLowering::getSetCCResultType(EVT VT) const { + if (!VT.isVector()) return MVT::i8; + return VT.changeVectorElementTypeToInteger(); +} + + +/// getMaxByValAlign - Helper for getByValTypeAlignment to determine +/// the desired ByVal argument alignment. 
+static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) { + if (MaxAlign == 16) + return; + if (VectorType *VTy = dyn_cast<VectorType>(Ty)) { + if (VTy->getBitWidth() == 128) + MaxAlign = 16; + } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { + unsigned EltAlign = 0; + getMaxByValAlign(ATy->getElementType(), EltAlign); + if (EltAlign > MaxAlign) + MaxAlign = EltAlign; + } else if (StructType *STy = dyn_cast<StructType>(Ty)) { + for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { + unsigned EltAlign = 0; + getMaxByValAlign(STy->getElementType(i), EltAlign); + if (EltAlign > MaxAlign) + MaxAlign = EltAlign; + if (MaxAlign == 16) + break; + } + } + return; +} + +/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate +/// function arguments in the caller parameter area. For X86, aggregates +/// that contain SSE vectors are placed at 16-byte boundaries while the rest +/// are at 4-byte boundaries. +unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const { + if (Subtarget->is64Bit()) { + // Max of 8 and alignment of type. + unsigned TyAlign = TD->getABITypeAlignment(Ty); + if (TyAlign > 8) + return TyAlign; + return 8; + } + + unsigned Align = 4; + if (Subtarget->hasXMM()) + getMaxByValAlign(Ty, Align); + return Align; +} + +/// getOptimalMemOpType - Returns the target specific optimal type for load +/// and store operations as a result of memset, memcpy, and memmove +/// lowering. If DstAlign is zero that means it's safe to destination +/// alignment can satisfy any constraint. Similarly if SrcAlign is zero it +/// means there isn't a need to check it against alignment requirement, +/// probably because the source does not need to be loaded. If +/// 'NonScalarIntSafe' is true, that means it's safe to return a +/// non-scalar-integer type, e.g. empty string source, constant, or loaded +/// from memory. 'MemcpyStrSrc' indicates whether the memcpy source is +/// constant so it does not need to be loaded. +/// It returns EVT::Other if the type should be determined using generic +/// target-independent logic. +EVT +X86TargetLowering::getOptimalMemOpType(uint64_t Size, + unsigned DstAlign, unsigned SrcAlign, + bool NonScalarIntSafe, + bool MemcpyStrSrc, + MachineFunction &MF) const { + // FIXME: This turns off use of xmm stores for memset/memcpy on targets like + // linux. This is because the stack realignment code can't handle certain + // cases like PR2962. This should be removed when PR2962 is fixed. + const Function *F = MF.getFunction(); + if (NonScalarIntSafe && + !F->hasFnAttr(Attribute::NoImplicitFloat)) { + if (Size >= 16 && + (Subtarget->isUnalignedMemAccessFast() || + ((DstAlign == 0 || DstAlign >= 16) && + (SrcAlign == 0 || SrcAlign >= 16))) && + Subtarget->getStackAlignment() >= 16) { + if (Subtarget->hasAVX() && + Subtarget->getStackAlignment() >= 32) + return MVT::v8f32; + if (Subtarget->hasXMMInt()) + return MVT::v4i32; + if (Subtarget->hasXMM()) + return MVT::v4f32; + } else if (!MemcpyStrSrc && Size >= 8 && + !Subtarget->is64Bit() && + Subtarget->getStackAlignment() >= 8 && + Subtarget->hasXMMInt()) { + // Do not use f64 to lower memcpy if source is string constant. It's + // better to use i32 to avoid the loads. + return MVT::f64; + } + } + if (Subtarget->is64Bit() && Size >= 8) + return MVT::i64; + return MVT::i32; +} + +/// getJumpTableEncoding - Return the entry encoding for a jump table in the +/// current function. The returned value is a member of the +/// MachineJumpTableInfo::JTEntryKind enum. 
+unsigned X86TargetLowering::getJumpTableEncoding() const {
+  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
+  // symbol.
+  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+      Subtarget->isPICStyleGOT())
+    return MachineJumpTableInfo::EK_Custom32;
+
+  // Otherwise, use the normal jump table encoding heuristics.
+  return TargetLowering::getJumpTableEncoding();
+}
+
+const MCExpr *
+X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
+                                             const MachineBasicBlock *MBB,
+                                             unsigned uid, MCContext &Ctx) const {
+  assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+         Subtarget->isPICStyleGOT());
+  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
+  // entries.
+  return MCSymbolRefExpr::Create(MBB->getSymbol(),
+                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
+}
+
+/// getPICJumpTableRelocBase - Returns the relocation base for the given PIC
+/// jumptable.
+SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
+                                                    SelectionDAG &DAG) const {
+  if (!Subtarget->is64Bit())
+    // This doesn't have DebugLoc associated with it, but is not really the
+    // same as a Register.
+    return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy());
+  return Table;
+}
+
+/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
+/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
+/// MCExpr.
+const MCExpr *X86TargetLowering::
+getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
+                             MCContext &Ctx) const {
+  // X86-64 uses RIP relative addressing based on the jump table label.
+  if (Subtarget->isPICStyleRIPRel())
+    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
+
+  // Otherwise, the reference is relative to the PIC base.
+  return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
+}
+
+// FIXME: Why is this routine here? Move to RegInfo!
+std::pair<const TargetRegisterClass*, uint8_t>
+X86TargetLowering::findRepresentativeClass(EVT VT) const {
+  const TargetRegisterClass *RRC = 0;
+  uint8_t Cost = 1;
+  switch (VT.getSimpleVT().SimpleTy) {
+  default:
+    return TargetLowering::findRepresentativeClass(VT);
+  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
+    RRC = (Subtarget->is64Bit()
+           ?
X86::GR64RegisterClass : X86::GR32RegisterClass); + break; + case MVT::x86mmx: + RRC = X86::VR64RegisterClass; + break; + case MVT::f32: case MVT::f64: + case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: + case MVT::v4f32: case MVT::v2f64: + case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32: + case MVT::v4f64: + RRC = X86::VR128RegisterClass; + break; + } + return std::make_pair(RRC, Cost); +} + +bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace, + unsigned &Offset) const { + if (!Subtarget->isTargetLinux()) + return false; + + if (Subtarget->is64Bit()) { + // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs: + Offset = 0x28; + if (getTargetMachine().getCodeModel() == CodeModel::Kernel) + AddressSpace = 256; + else + AddressSpace = 257; + } else { + // %gs:0x14 on i386 + Offset = 0x14; + AddressSpace = 256; + } + return true; +} + + +//===----------------------------------------------------------------------===// +// Return Value Calling Convention Implementation +//===----------------------------------------------------------------------===// + +#include "X86GenCallingConv.inc" + +bool +X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, + MachineFunction &MF, bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + LLVMContext &Context) const { + SmallVector<CCValAssign, 16> RVLocs; + CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), + RVLocs, Context); + return CCInfo.CheckReturn(Outs, RetCC_X86); +} + +SDValue +X86TargetLowering::LowerReturn(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + DebugLoc dl, SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); + + SmallVector<CCValAssign, 16> RVLocs; + CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), + RVLocs, *DAG.getContext()); + CCInfo.AnalyzeReturn(Outs, RetCC_X86); + + // Add the regs to the liveout set for the function. + MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); + for (unsigned i = 0; i != RVLocs.size(); ++i) + if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg())) + MRI.addLiveOut(RVLocs[i].getLocReg()); + + SDValue Flag; + + SmallVector<SDValue, 6> RetOps; + RetOps.push_back(Chain); // Operand #0 = Chain (updated below) + // Operand #1 = Bytes To Pop + RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), + MVT::i16)); + + // Copy the result values into the output registers. + for (unsigned i = 0; i != RVLocs.size(); ++i) { + CCValAssign &VA = RVLocs[i]; + assert(VA.isRegLoc() && "Can only return in registers!"); + SDValue ValToCopy = OutVals[i]; + EVT ValVT = ValToCopy.getValueType(); + + // If this is x86-64, and we disabled SSE, we can't return FP values, + // or SSE or MMX vectors. + if ((ValVT == MVT::f32 || ValVT == MVT::f64 || + VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) && + (Subtarget->is64Bit() && !Subtarget->hasXMM())) { + report_fatal_error("SSE register return with SSE disabled"); + } + // Likewise we can't return F64 values with SSE1 only. gcc does so, but + // llvm-gcc has never done it right and no one has noticed, so this + // should be OK for now. 
+ if (ValVT == MVT::f64 && + (Subtarget->is64Bit() && !Subtarget->hasXMMInt())) + report_fatal_error("SSE2 register return with SSE2 disabled"); + + // Returns in ST0/ST1 are handled specially: these are pushed as operands to + // the RET instruction and handled by the FP Stackifier. + if (VA.getLocReg() == X86::ST0 || + VA.getLocReg() == X86::ST1) { + // If this is a copy from an xmm register to ST(0), use an FPExtend to + // change the value to the FP stack register class. + if (isScalarFPTypeInSSEReg(VA.getValVT())) + ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); + RetOps.push_back(ValToCopy); + // Don't emit a copytoreg. + continue; + } + + // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 + // which is returned in RAX / RDX. + if (Subtarget->is64Bit()) { + if (ValVT == MVT::x86mmx) { + if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { + ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy); + ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, + ValToCopy); + // If we don't have SSE2 available, convert to v4f32 so the generated + // register is legal. + if (!Subtarget->hasXMMInt()) + ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy); + } + } + } + + Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag); + Flag = Chain.getValue(1); + } + + // The x86-64 ABI for returning structs by value requires that we copy + // the sret argument into %rax for the return. We saved the argument into + // a virtual register in the entry block, so now we copy the value out + // and into %rax. + if (Subtarget->is64Bit() && + DAG.getMachineFunction().getFunction()->hasStructRetAttr()) { + MachineFunction &MF = DAG.getMachineFunction(); + X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); + unsigned Reg = FuncInfo->getSRetReturnReg(); + assert(Reg && + "SRetReturnReg should have been set in LowerFormalArguments()."); + SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy()); + + Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag); + Flag = Chain.getValue(1); + + // RAX now acts like a return value. + MRI.addLiveOut(X86::RAX); + } + + RetOps[0] = Chain; // Update chain. + + // Add the flag if we have it. + if (Flag.getNode()) + RetOps.push_back(Flag); + + return DAG.getNode(X86ISD::RET_FLAG, dl, + MVT::Other, &RetOps[0], RetOps.size()); +} + +bool X86TargetLowering::isUsedByReturnOnly(SDNode *N) const { + if (N->getNumValues() != 1) + return false; + if (!N->hasNUsesOfValue(1, 0)) + return false; + + SDNode *Copy = *N->use_begin(); + if (Copy->getOpcode() != ISD::CopyToReg && + Copy->getOpcode() != ISD::FP_EXTEND) + return false; + + bool HasRet = false; + for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); + UI != UE; ++UI) { + if (UI->getOpcode() != X86ISD::RET_FLAG) + return false; + HasRet = true; + } + + return HasRet; +} + +EVT +X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT, + ISD::NodeType ExtendKind) const { + MVT ReturnMVT; + // TODO: Is this also valid on 32-bit? + if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND) + ReturnMVT = MVT::i8; + else + ReturnMVT = MVT::i32; + + EVT MinVT = getRegisterType(Context, ReturnMVT); + return VT.bitsLT(MinVT) ? MinVT : VT; +} + +/// LowerCallResult - Lower the result values of a call into the +/// appropriate copies out of appropriate physical registers. 
+/// +SDValue +X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const { + + // Assign locations to each value returned by this call. + SmallVector<CCValAssign, 16> RVLocs; + bool Is64Bit = Subtarget->is64Bit(); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), RVLocs, *DAG.getContext()); + CCInfo.AnalyzeCallResult(Ins, RetCC_X86); + + // Copy all of the result registers out of their specified physreg. + for (unsigned i = 0; i != RVLocs.size(); ++i) { + CCValAssign &VA = RVLocs[i]; + EVT CopyVT = VA.getValVT(); + + // If this is x86-64, and we disabled SSE, we can't return FP values + if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && + ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasXMM())) { + report_fatal_error("SSE register return with SSE disabled"); + } + + SDValue Val; + + // If this is a call to a function that returns an fp value on the floating + // point stack, we must guarantee the the value is popped from the stack, so + // a CopyFromReg is not good enough - the copy instruction may be eliminated + // if the return value is not used. We use the FpPOP_RETVAL instruction + // instead. + if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) { + // If we prefer to use the value in xmm registers, copy it out as f80 and + // use a truncate to move it from fp stack reg to xmm reg. + if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80; + SDValue Ops[] = { Chain, InFlag }; + Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT, + MVT::Other, MVT::Glue, Ops, 2), 1); + Val = Chain.getValue(0); + + // Round the f80 to the right size, which also moves it to the appropriate + // xmm register. + if (CopyVT != VA.getValVT()) + Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, + // This truncation won't change the value. + DAG.getIntPtrConstant(1)); + } else { + Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), + CopyVT, InFlag).getValue(1); + Val = Chain.getValue(0); + } + InFlag = Chain.getValue(2); + InVals.push_back(Val); + } + + return Chain; +} + + +//===----------------------------------------------------------------------===// +// C & StdCall & Fast Calling Convention implementation +//===----------------------------------------------------------------------===// +// StdCall calling convention seems to be standard for many Windows' API +// routines and around. It differs from C calling convention just a little: +// callee should clean up the stack, not caller. Symbols should be also +// decorated in some fancy way :) It doesn't support any vector arguments. +// For info on fast calling convention see Fast Calling Convention (tail call) +// implementation LowerX86_32FastCCCallTo. + +/// CallIsStructReturn - Determines whether a call uses struct return +/// semantics. +static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) { + if (Outs.empty()) + return false; + + return Outs[0].Flags.isSRet(); +} + +/// ArgsAreStructReturn - Determines whether a function uses struct +/// return semantics. 
+static bool +ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) { + if (Ins.empty()) + return false; + + return Ins[0].Flags.isSRet(); +} + +/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified +/// by "Src" to address "Dst" with size and alignment information specified by +/// the specific parameter attribute. The copy will be passed as a byval +/// function parameter. +static SDValue +CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, + ISD::ArgFlagsTy Flags, SelectionDAG &DAG, + DebugLoc dl) { + SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); + + return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), + /*isVolatile*/false, /*AlwaysInline=*/true, + MachinePointerInfo(), MachinePointerInfo()); +} + +/// IsTailCallConvention - Return true if the calling convention is one that +/// supports tail call optimization. +static bool IsTailCallConvention(CallingConv::ID CC) { + return (CC == CallingConv::Fast || CC == CallingConv::GHC); +} + +bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { + if (!CI->isTailCall()) + return false; + + CallSite CS(CI); + CallingConv::ID CalleeCC = CS.getCallingConv(); + if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C) + return false; + + return true; +} + +/// FuncIsMadeTailCallSafe - Return true if the function is being made into +/// a tailcall target by changing its ABI. +static bool FuncIsMadeTailCallSafe(CallingConv::ID CC) { + return GuaranteedTailCallOpt && IsTailCallConvention(CC); +} + +SDValue +X86TargetLowering::LowerMemArgument(SDValue Chain, + CallingConv::ID CallConv, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, SelectionDAG &DAG, + const CCValAssign &VA, + MachineFrameInfo *MFI, + unsigned i) const { + // Create the nodes corresponding to a load from this parameter slot. + ISD::ArgFlagsTy Flags = Ins[i].Flags; + bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv); + bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); + EVT ValVT; + + // If value is passed by pointer we have address passed instead of the value + // itself. + if (VA.getLocInfo() == CCValAssign::Indirect) + ValVT = VA.getLocVT(); + else + ValVT = VA.getValVT(); + + // FIXME: For now, all byval parameter objects are marked mutable. This can be + // changed with more analysis. + // In case of tail call optimization mark all arguments mutable. Since they + // could be overwritten by lowering of arguments in case of a tail call. + if (Flags.isByVal()) { + unsigned Bytes = Flags.getByValSize(); + if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects. 
+ int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable); + return DAG.getFrameIndex(FI, getPointerTy()); + } else { + int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, + VA.getLocMemOffset(), isImmutable); + SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); + return DAG.getLoad(ValVT, dl, Chain, FIN, + MachinePointerInfo::getFixedStack(FI), + false, false, 0); + } +} + +SDValue +X86TargetLowering::LowerFormalArguments(SDValue Chain, + CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, + SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) + const { + MachineFunction &MF = DAG.getMachineFunction(); + X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); + + const Function* Fn = MF.getFunction(); + if (Fn->hasExternalLinkage() && + Subtarget->isTargetCygMing() && + Fn->getName() == "main") + FuncInfo->setForceFramePointer(true); + + MachineFrameInfo *MFI = MF.getFrameInfo(); + bool Is64Bit = Subtarget->is64Bit(); + bool IsWin64 = Subtarget->isTargetWin64(); + + assert(!(isVarArg && IsTailCallConvention(CallConv)) && + "Var args not supported with calling convention fastcc or ghc"); + + // Assign locations to all of the incoming arguments. + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), + ArgLocs, *DAG.getContext()); + + // Allocate shadow area for Win64 + if (IsWin64) { + CCInfo.AllocateStack(32, 8); + } + + CCInfo.AnalyzeFormalArguments(Ins, CC_X86); + + unsigned LastVal = ~0U; + SDValue ArgValue; + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + // TODO: If an arg is passed in two places (e.g. reg and stack), skip later + // places. + assert(VA.getValNo() != LastVal && + "Don't support value assigned to multiple locs yet"); + (void)LastVal; + LastVal = VA.getValNo(); + + if (VA.isRegLoc()) { + EVT RegVT = VA.getLocVT(); + TargetRegisterClass *RC = NULL; + if (RegVT == MVT::i32) + RC = X86::GR32RegisterClass; + else if (Is64Bit && RegVT == MVT::i64) + RC = X86::GR64RegisterClass; + else if (RegVT == MVT::f32) + RC = X86::FR32RegisterClass; + else if (RegVT == MVT::f64) + RC = X86::FR64RegisterClass; + else if (RegVT.isVector() && RegVT.getSizeInBits() == 256) + RC = X86::VR256RegisterClass; + else if (RegVT.isVector() && RegVT.getSizeInBits() == 128) + RC = X86::VR128RegisterClass; + else if (RegVT == MVT::x86mmx) + RC = X86::VR64RegisterClass; + else + llvm_unreachable("Unknown argument type!"); + + unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); + ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); + + // If this is an 8 or 16-bit value, it is really passed promoted to 32 + // bits. Insert an assert[sz]ext to capture this, then truncate to the + // right size. + if (VA.getLocInfo() == CCValAssign::SExt) + ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, + DAG.getValueType(VA.getValVT())); + else if (VA.getLocInfo() == CCValAssign::ZExt) + ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, + DAG.getValueType(VA.getValVT())); + else if (VA.getLocInfo() == CCValAssign::BCvt) + ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue); + + if (VA.isExtInLoc()) { + // Handle MMX values passed in XMM regs. 
+ if (RegVT.isVector()) { + ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), + ArgValue); + } else + ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); + } + } else { + assert(VA.isMemLoc()); + ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i); + } + + // If value is passed via pointer - do a load. + if (VA.getLocInfo() == CCValAssign::Indirect) + ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, + MachinePointerInfo(), false, false, 0); + + InVals.push_back(ArgValue); + } + + // The x86-64 ABI for returning structs by value requires that we copy + // the sret argument into %rax for the return. Save the argument into + // a virtual register so that we can access it from the return points. + if (Is64Bit && MF.getFunction()->hasStructRetAttr()) { + X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); + unsigned Reg = FuncInfo->getSRetReturnReg(); + if (!Reg) { + Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64)); + FuncInfo->setSRetReturnReg(Reg); + } + SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]); + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); + } + + unsigned StackSize = CCInfo.getNextStackOffset(); + // Align stack specially for tail calls. + if (FuncIsMadeTailCallSafe(CallConv)) + StackSize = GetAlignedArgumentStackSize(StackSize, DAG); + + // If the function takes variable number of arguments, make a frame index for + // the start of the first vararg value... for expansion of llvm.va_start. + if (isVarArg) { + if (Is64Bit || (CallConv != CallingConv::X86_FastCall && + CallConv != CallingConv::X86_ThisCall)) { + FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true)); + } + if (Is64Bit) { + unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0; + + // FIXME: We should really autogenerate these arrays + static const unsigned GPR64ArgRegsWin64[] = { + X86::RCX, X86::RDX, X86::R8, X86::R9 + }; + static const unsigned GPR64ArgRegs64Bit[] = { + X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 + }; + static const unsigned XMMArgRegs64Bit[] = { + X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, + X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 + }; + const unsigned *GPR64ArgRegs; + unsigned NumXMMRegs = 0; + + if (IsWin64) { + // The XMM registers which might contain var arg parameters are shadowed + // in their paired GPR. So we only need to save the GPR to their home + // slots. + TotalNumIntRegs = 4; + GPR64ArgRegs = GPR64ArgRegsWin64; + } else { + TotalNumIntRegs = 6; TotalNumXMMRegs = 8; + GPR64ArgRegs = GPR64ArgRegs64Bit; + + NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit, TotalNumXMMRegs); + } + unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs, + TotalNumIntRegs); + + bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat); + assert(!(NumXMMRegs && !Subtarget->hasXMM()) && + "SSE register cannot be used when SSE is disabled!"); + assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) && + "SSE register cannot be used when SSE is disabled!"); + if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasXMM()) + // Kernel mode asks for SSE to be disabled, so don't push them + // on the stack. + TotalNumXMMRegs = 0; + + if (IsWin64) { + const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering(); + // Get to the caller-allocated home save location. Add 8 to account + // for the return address. 
+ int HomeOffset = TFI.getOffsetOfLocalArea() + 8; + FuncInfo->setRegSaveFrameIndex( + MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); + // Fixup to set vararg frame on shadow area (4 x i64). + if (NumIntRegs < 4) + FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); + } else { + // For X86-64, if there are vararg parameters that are passed via + // registers, then we must store them to their spots on the stack so they + // may be loaded by deferencing the result of va_next. + FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); + FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16); + FuncInfo->setRegSaveFrameIndex( + MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16, + false)); + } + + // Store the integer parameter registers. + SmallVector<SDValue, 8> MemOps; + SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), + getPointerTy()); + unsigned Offset = FuncInfo->getVarArgsGPOffset(); + for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) { + SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, + DAG.getIntPtrConstant(Offset)); + unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs], + X86::GR64RegisterClass); + SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); + SDValue Store = + DAG.getStore(Val.getValue(1), dl, Val, FIN, + MachinePointerInfo::getFixedStack( + FuncInfo->getRegSaveFrameIndex(), Offset), + false, false, 0); + MemOps.push_back(Store); + Offset += 8; + } + + if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) { + // Now store the XMM (fp + vector) parameter registers. + SmallVector<SDValue, 11> SaveXMMOps; + SaveXMMOps.push_back(Chain); + + unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass); + SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8); + SaveXMMOps.push_back(ALVal); + + SaveXMMOps.push_back(DAG.getIntPtrConstant( + FuncInfo->getRegSaveFrameIndex())); + SaveXMMOps.push_back(DAG.getIntPtrConstant( + FuncInfo->getVarArgsFPOffset())); + + for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) { + unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs], + X86::VR128RegisterClass); + SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32); + SaveXMMOps.push_back(Val); + } + MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, + MVT::Other, + &SaveXMMOps[0], SaveXMMOps.size())); + } + + if (!MemOps.empty()) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + &MemOps[0], MemOps.size()); + } + } + + // Some CCs need callee pop. + if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, GuaranteedTailCallOpt)) { + FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. + } else { + FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. + // If this is an sret function, the return should pop the hidden pointer. + if (!Is64Bit && !IsTailCallConvention(CallConv) && ArgsAreStructReturn(Ins)) + FuncInfo->setBytesToPopOnReturn(4); + } + + if (!Is64Bit) { + // RegSaveFrameIndex is X86-64 only. + FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); + if (CallConv == CallingConv::X86_FastCall || + CallConv == CallingConv::X86_ThisCall) + // fastcc functions can't have varargs. 
+ FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); + } + + FuncInfo->setArgumentStackSize(StackSize); + + return Chain; +} + +SDValue +X86TargetLowering::LowerMemOpCallTo(SDValue Chain, + SDValue StackPtr, SDValue Arg, + DebugLoc dl, SelectionDAG &DAG, + const CCValAssign &VA, + ISD::ArgFlagsTy Flags) const { + unsigned LocMemOffset = VA.getLocMemOffset(); + SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); + PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); + if (Flags.isByVal()) + return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); + + return DAG.getStore(Chain, dl, Arg, PtrOff, + MachinePointerInfo::getStack(LocMemOffset), + false, false, 0); +} + +/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call +/// optimization is performed and it is required. +SDValue +X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, + SDValue &OutRetAddr, SDValue Chain, + bool IsTailCall, bool Is64Bit, + int FPDiff, DebugLoc dl) const { + // Adjust the Return address stack slot. + EVT VT = getPointerTy(); + OutRetAddr = getReturnAddressFrameIndex(DAG); + + // Load the "old" Return address. + OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(), + false, false, 0); + return SDValue(OutRetAddr.getNode(), 1); +} + +/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call +/// optimization is performed and it is required (FPDiff!=0). +static SDValue +EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, + SDValue Chain, SDValue RetAddrFrIdx, + bool Is64Bit, int FPDiff, DebugLoc dl) { + // Store the return address to the appropriate stack slot. + if (!FPDiff) return Chain; + // Calculate the new stack slot for the return address. + int SlotSize = Is64Bit ? 8 : 4; + int NewReturnAddrFI = + MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false); + EVT VT = Is64Bit ? MVT::i64 : MVT::i32; + SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT); + Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, + MachinePointerInfo::getFixedStack(NewReturnAddrFI), + false, false, 0); + return Chain; +} + +SDValue +X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, + CallingConv::ID CallConv, bool isVarArg, + bool &isTailCall, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const { + MachineFunction &MF = DAG.getMachineFunction(); + bool Is64Bit = Subtarget->is64Bit(); + bool IsWin64 = Subtarget->isTargetWin64(); + bool IsStructRet = CallIsStructReturn(Outs); + bool IsSibcall = false; + + if (isTailCall) { + // Check if it's really possible to do a tail call. + isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, + isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(), + Outs, OutVals, Ins, DAG); + + // Sibcalls are automatically detected tailcalls which do not require + // ABI changes. + if (!GuaranteedTailCallOpt && isTailCall) + IsSibcall = true; + + if (isTailCall) + ++NumTailCalls; + } + + assert(!(isVarArg && IsTailCallConvention(CallConv)) && + "Var args not supported with calling convention fastcc or ghc"); + + // Analyze operands of the call, assigning locations to each operand. 
+ SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), + ArgLocs, *DAG.getContext()); + + // Allocate shadow area for Win64 + if (IsWin64) { + CCInfo.AllocateStack(32, 8); + } + + CCInfo.AnalyzeCallOperands(Outs, CC_X86); + + // Get a count of how many bytes are to be pushed on the stack. + unsigned NumBytes = CCInfo.getNextStackOffset(); + if (IsSibcall) + // This is a sibcall. The memory operands are available in caller's + // own caller's stack. + NumBytes = 0; + else if (GuaranteedTailCallOpt && IsTailCallConvention(CallConv)) + NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); + + int FPDiff = 0; + if (isTailCall && !IsSibcall) { + // Lower arguments at fp - stackoffset + fpdiff. + unsigned NumBytesCallerPushed = + MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn(); + FPDiff = NumBytesCallerPushed - NumBytes; + + // Set the delta of movement of the returnaddr stackslot. + // But only set if delta is greater than previous delta. + if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta())) + MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff); + } + + if (!IsSibcall) + Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); + + SDValue RetAddrFrIdx; + // Load return address for tail calls. + if (isTailCall && FPDiff) + Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, + Is64Bit, FPDiff, dl); + + SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; + SmallVector<SDValue, 8> MemOpChains; + SDValue StackPtr; + + // Walk the register/memloc assignments, inserting copies/loads. In the case + // of tail call optimization arguments are handle later. + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + EVT RegVT = VA.getLocVT(); + SDValue Arg = OutVals[i]; + ISD::ArgFlagsTy Flags = Outs[i].Flags; + bool isByVal = Flags.isByVal(); + + // Promote the value if needed. + switch (VA.getLocInfo()) { + default: llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: break; + case CCValAssign::SExt: + Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); + break; + case CCValAssign::ZExt: + Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg); + break; + case CCValAssign::AExt: + if (RegVT.isVector() && RegVT.getSizeInBits() == 128) { + // Special case: passing MMX values in XMM registers. + Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg); + Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg); + Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg); + } else + Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg); + break; + case CCValAssign::BCvt: + Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg); + break; + case CCValAssign::Indirect: { + // Store the argument. + SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); + int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); + Chain = DAG.getStore(Chain, dl, Arg, SpillSlot, + MachinePointerInfo::getFixedStack(FI), + false, false, 0); + Arg = SpillSlot; + break; + } + } + + if (VA.isRegLoc()) { + RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); + if (isVarArg && IsWin64) { + // Win64 ABI requires argument XMM reg to be copied to the corresponding + // shadow reg if callee is a varargs function. 
+ unsigned ShadowReg = 0; + switch (VA.getLocReg()) { + case X86::XMM0: ShadowReg = X86::RCX; break; + case X86::XMM1: ShadowReg = X86::RDX; break; + case X86::XMM2: ShadowReg = X86::R8; break; + case X86::XMM3: ShadowReg = X86::R9; break; + } + if (ShadowReg) + RegsToPass.push_back(std::make_pair(ShadowReg, Arg)); + } + } else if (!IsSibcall && (!isTailCall || isByVal)) { + assert(VA.isMemLoc()); + if (StackPtr.getNode() == 0) + StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy()); + MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, + dl, DAG, VA, Flags)); + } + } + + if (!MemOpChains.empty()) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + &MemOpChains[0], MemOpChains.size()); + + // Build a sequence of copy-to-reg nodes chained together with token chain + // and flag operands which copy the outgoing args into registers. + SDValue InFlag; + // Tail call byval lowering might overwrite argument registers so in case of + // tail call optimization the copies to registers are lowered later. + if (!isTailCall) + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { + Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, + RegsToPass[i].second, InFlag); + InFlag = Chain.getValue(1); + } + + if (Subtarget->isPICStyleGOT()) { + // ELF / PIC requires GOT in the EBX register before function calls via PLT + // GOT pointer. + if (!isTailCall) { + Chain = DAG.getCopyToReg(Chain, dl, X86::EBX, + DAG.getNode(X86ISD::GlobalBaseReg, + DebugLoc(), getPointerTy()), + InFlag); + InFlag = Chain.getValue(1); + } else { + // If we are tail calling and generating PIC/GOT style code load the + // address of the callee into ECX. The value in ecx is used as target of + // the tail jump. This is done to circumvent the ebx/callee-saved problem + // for tail calls on PIC/GOT architectures. Normally we would just put the + // address of GOT into ebx and then call target@PLT. But for tail calls + // ebx would be restored (since ebx is callee saved) before jumping to the + // target@PLT. + + // Note: The actual moving to ECX is done further down. + GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); + if (G && !G->getGlobal()->hasHiddenVisibility() && + !G->getGlobal()->hasProtectedVisibility()) + Callee = LowerGlobalAddress(Callee, DAG); + else if (isa<ExternalSymbolSDNode>(Callee)) + Callee = LowerExternalSymbol(Callee, DAG); + } + } + + if (Is64Bit && isVarArg && !IsWin64) { + // From AMD64 ABI document: + // For calls that may call functions that use varargs or stdargs + // (prototype-less calls or calls to functions containing ellipsis (...) in + // the declaration) %al is used as hidden argument to specify the number + // of SSE registers used. The contents of %al do not need to match exactly + // the number of registers, but must be an ubound on the number of SSE + // registers used and is in the range 0 - 8 inclusive. + + // Count the number of XMM registers allocated. + static const unsigned XMMArgRegs[] = { + X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, + X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 + }; + unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8); + assert((Subtarget->hasXMM() || !NumXMMRegs) + && "SSE registers cannot be used when SSE is disabled"); + + Chain = DAG.getCopyToReg(Chain, dl, X86::AL, + DAG.getConstant(NumXMMRegs, MVT::i8), InFlag); + InFlag = Chain.getValue(1); + } + + + // For tail calls lower the arguments to the 'real' stack slot. 
+ if (isTailCall) { + // Force all the incoming stack arguments to be loaded from the stack + // before any new outgoing arguments are stored to the stack, because the + // outgoing stack slots may alias the incoming argument stack slots, and + // the alias isn't otherwise explicit. This is slightly more conservative + // than necessary, because it means that each store effectively depends + // on every argument instead of just those arguments it would clobber. + SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain); + + SmallVector<SDValue, 8> MemOpChains2; + SDValue FIN; + int FI = 0; + // Do not flag preceding copytoreg stuff together with the following stuff. + InFlag = SDValue(); + if (GuaranteedTailCallOpt) { + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + if (VA.isRegLoc()) + continue; + assert(VA.isMemLoc()); + SDValue Arg = OutVals[i]; + ISD::ArgFlagsTy Flags = Outs[i].Flags; + // Create frame index. + int32_t Offset = VA.getLocMemOffset()+FPDiff; + uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; + FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); + FIN = DAG.getFrameIndex(FI, getPointerTy()); + + if (Flags.isByVal()) { + // Copy relative to framepointer. + SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset()); + if (StackPtr.getNode() == 0) + StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, + getPointerTy()); + Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source); + + MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, + ArgChain, + Flags, DAG, dl)); + } else { + // Store relative to framepointer. + MemOpChains2.push_back( + DAG.getStore(ArgChain, dl, Arg, FIN, + MachinePointerInfo::getFixedStack(FI), + false, false, 0)); + } + } + } + + if (!MemOpChains2.empty()) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + &MemOpChains2[0], MemOpChains2.size()); + + // Copy arguments to their registers. + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { + Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, + RegsToPass[i].second, InFlag); + InFlag = Chain.getValue(1); + } + InFlag =SDValue(); + + // Store the return address to the appropriate stack slot. + Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit, + FPDiff, dl); + } + + if (getTargetMachine().getCodeModel() == CodeModel::Large) { + assert(Is64Bit && "Large code model is only legal in 64-bit mode."); + // In the 64-bit large code model, we have to make all calls + // through a register, since the call instruction's 32-bit + // pc-relative offset may not be large enough to hold the whole + // address. + } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { + // If the callee is a GlobalAddress node (quite common, every direct call + // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack + // it. + + // We should use extra load for direct calls to dllimported functions in + // non-JIT mode. + const GlobalValue *GV = G->getGlobal(); + if (!GV->hasDLLImportLinkage()) { + unsigned char OpFlags = 0; + bool ExtraLoad = false; + unsigned WrapperKind = ISD::DELETED_NODE; + + // On ELF targets, in both X86-64 and X86-32 mode, direct calls to + // external symbols most go through the PLT in PIC mode. If the symbol + // has hidden or protected visibility, or if it is static or local, then + // we don't need to use the PLT - we can directly call it. 
+ if (Subtarget->isTargetELF() && + getTargetMachine().getRelocationModel() == Reloc::PIC_ && + GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) { + OpFlags = X86II::MO_PLT; + } else if (Subtarget->isPICStyleStubAny() && + (GV->isDeclaration() || GV->isWeakForLinker()) && + (!Subtarget->getTargetTriple().isMacOSX() || + Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { + // PC-relative references to external symbols should go through $stub, + // unless we're building with the leopard linker or later, which + // automatically synthesizes these stubs. + OpFlags = X86II::MO_DARWIN_STUB; + } else if (Subtarget->isPICStyleRIPRel() && + isa<Function>(GV) && + cast<Function>(GV)->hasFnAttr(Attribute::NonLazyBind)) { + // If the function is marked as non-lazy, generate an indirect call + // which loads from the GOT directly. This avoids runtime overhead + // at the cost of eager binding (and one extra byte of encoding). + OpFlags = X86II::MO_GOTPCREL; + WrapperKind = X86ISD::WrapperRIP; + ExtraLoad = true; + } + + Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), + G->getOffset(), OpFlags); + + // Add a wrapper if needed. + if (WrapperKind != ISD::DELETED_NODE) + Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee); + // Add extra indirection if needed. + if (ExtraLoad) + Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee, + MachinePointerInfo::getGOT(), + false, false, 0); + } + } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { + unsigned char OpFlags = 0; + + // On ELF targets, in either X86-64 or X86-32 mode, direct calls to + // external symbols should go through the PLT. + if (Subtarget->isTargetELF() && + getTargetMachine().getRelocationModel() == Reloc::PIC_) { + OpFlags = X86II::MO_PLT; + } else if (Subtarget->isPICStyleStubAny() && + (!Subtarget->getTargetTriple().isMacOSX() || + Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { + // PC-relative references to external symbols should go through $stub, + // unless we're building with the leopard linker or later, which + // automatically synthesizes these stubs. + OpFlags = X86II::MO_DARWIN_STUB; + } + + Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(), + OpFlags); + } + + // Returns a chain & a flag for retval copy to use. + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); + SmallVector<SDValue, 8> Ops; + + if (!IsSibcall && isTailCall) { + Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), + DAG.getIntPtrConstant(0, true), InFlag); + InFlag = Chain.getValue(1); + } + + Ops.push_back(Chain); + Ops.push_back(Callee); + + if (isTailCall) + Ops.push_back(DAG.getConstant(FPDiff, MVT::i32)); + + // Add argument registers to the end of the list so that they are known live + // into the call. + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) + Ops.push_back(DAG.getRegister(RegsToPass[i].first, + RegsToPass[i].second.getValueType())); + + // Add an implicit use GOT pointer in EBX. + if (!isTailCall && Subtarget->isPICStyleGOT()) + Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy())); + + // Add an implicit use of AL for non-Windows x86 64-bit vararg functions. + if (Is64Bit && isVarArg && !IsWin64) + Ops.push_back(DAG.getRegister(X86::AL, MVT::i8)); + + if (InFlag.getNode()) + Ops.push_back(InFlag); + + if (isTailCall) { + // We used to do: + //// If this is the first return lowered for this function, add the regs + //// to the liveout set for the function. 
+    // This isn't right, although it's probably harmless on x86; liveouts
+    // should be computed from returns, not tail calls. Consider a void
+    // function making a tail call to a function returning int.
+    return DAG.getNode(X86ISD::TC_RETURN, dl,
+                       NodeTys, &Ops[0], Ops.size());
+  }
+
+  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
+  InFlag = Chain.getValue(1);
+
+  // Create the CALLSEQ_END node.
+  unsigned NumBytesForCalleeToPush;
+  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, GuaranteedTailCallOpt))
+    NumBytesForCalleeToPush = NumBytes;    // Callee pops everything
+  else if (!Is64Bit && !IsTailCallConvention(CallConv) && IsStructRet)
+    // If this is a call to a struct-return function, the callee
+    // pops the hidden struct pointer, so we have to push it back.
+    // This is common for Darwin/X86, Linux & Mingw32 targets.
+    NumBytesForCalleeToPush = 4;
+  else
+    NumBytesForCalleeToPush = 0;  // Callee pops nothing.
+
+  // Returns a flag for retval copy to use.
+  if (!IsSibcall) {
+    Chain = DAG.getCALLSEQ_END(Chain,
+                               DAG.getIntPtrConstant(NumBytes, true),
+                               DAG.getIntPtrConstant(NumBytesForCalleeToPush,
+                                                     true),
+                               InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // Handle result values, copying them out of physregs into vregs that we
+  // return.
+  return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
+                         Ins, dl, DAG, InVals);
+}
+
+
+//===----------------------------------------------------------------------===//
+//                Fast Calling Convention (tail call) implementation
+//===----------------------------------------------------------------------===//
+
+//  Like stdcall, the callee cleans up the arguments, except that ECX is
+//  reserved for storing the address of the tail-called function. Only 2
+//  registers are free for argument passing (inreg). Tail call optimization is
+//  performed provided:
+//  * tailcallopt is enabled
+//  * caller/callee are fastcc
+//  On X86_64 architecture with GOT-style position independent code only local
+//  (within module) calls are supported at the moment.
+//  To keep the stack aligned according to the platform ABI, the function
+//  GetAlignedArgumentStackSize ensures that the argument delta is always a
+//  multiple of the stack alignment. (Dynamic linkers need this - Darwin's
+//  dyld, for example.)
+//  If the tail-called callee has more arguments than the caller, the caller
+//  needs to make sure that there is room to move the RETADDR to. This is
+//  achieved by reserving an area the size of the argument delta right after
+//  the original RETADDR, but before the saved framepointer or the spilled
+//  registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4):
+//  stack layout:
+//    arg1
+//    arg2
+//    RETADDR
+//    [ new RETADDR
+//      move area ]
+//    (possible EBP)
+//    ESI
+//    EDI
+//    local1 ..
+
+/// GetAlignedArgumentStackSize - Align the stack size, e.g. to 16n + 12 for a
+/// 16-byte alignment requirement.
+unsigned
+X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
+                                               SelectionDAG& DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  const TargetMachine &TM = MF.getTarget();
+  const TargetFrameLowering &TFI = *TM.getFrameLowering();
+  unsigned StackAlignment = TFI.getStackAlignment();
+  uint64_t AlignMask = StackAlignment - 1;
+  int64_t Offset = StackSize;
+  uint64_t SlotSize = TD->getPointerSize();
+  if ((Offset & AlignMask) <= (StackAlignment - SlotSize)) {
+    // Number smaller than 12 so just add the difference.
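+    // For example, with StackAlignment == 16 and SlotSize == 4 (32-bit), an
+    // incoming StackSize of 20 has (20 & 15) == 4 <= 12, so the result becomes
+    // 20 + (12 - 4) == 28, i.e. 16n + 12.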
+ Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask)); + } else { + // Mask out lower bits, add stackalignment once plus the 12 bytes. + Offset = ((~AlignMask) & Offset) + StackAlignment + + (StackAlignment-SlotSize); + } + return Offset; +} + +/// MatchingStackOffset - Return true if the given stack call argument is +/// already available in the same position (relatively) of the caller's +/// incoming argument stack. +static +bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, + MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, + const X86InstrInfo *TII) { + unsigned Bytes = Arg.getValueType().getSizeInBits() / 8; + int FI = INT_MAX; + if (Arg.getOpcode() == ISD::CopyFromReg) { + unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); + if (!TargetRegisterInfo::isVirtualRegister(VR)) + return false; + MachineInstr *Def = MRI->getVRegDef(VR); + if (!Def) + return false; + if (!Flags.isByVal()) { + if (!TII->isLoadFromStackSlot(Def, FI)) + return false; + } else { + unsigned Opcode = Def->getOpcode(); + if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) && + Def->getOperand(1).isFI()) { + FI = Def->getOperand(1).getIndex(); + Bytes = Flags.getByValSize(); + } else + return false; + } + } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) { + if (Flags.isByVal()) + // ByVal argument is passed in as a pointer but it's now being + // dereferenced. e.g. + // define @foo(%struct.X* %A) { + // tail call @bar(%struct.X* byval %A) + // } + return false; + SDValue Ptr = Ld->getBasePtr(); + FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); + if (!FINode) + return false; + FI = FINode->getIndex(); + } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) { + FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg); + FI = FINode->getIndex(); + Bytes = Flags.getByValSize(); + } else + return false; + + assert(FI != INT_MAX); + if (!MFI->isFixedObjectIndex(FI)) + return false; + return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); +} + +/// IsEligibleForTailCallOptimization - Check whether the call is eligible +/// for tail call optimization. Targets which want to do tail call +/// optimization should implement this function. +bool +X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, + CallingConv::ID CalleeCC, + bool isVarArg, + bool isCalleeStructRet, + bool isCallerStructRet, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SmallVectorImpl<ISD::InputArg> &Ins, + SelectionDAG& DAG) const { + if (!IsTailCallConvention(CalleeCC) && + CalleeCC != CallingConv::C) + return false; + + // If -tailcallopt is specified, make fastcc functions tail-callable. + const MachineFunction &MF = DAG.getMachineFunction(); + const Function *CallerF = DAG.getMachineFunction().getFunction(); + CallingConv::ID CallerCC = CallerF->getCallingConv(); + bool CCMatch = CallerCC == CalleeCC; + + if (GuaranteedTailCallOpt) { + if (IsTailCallConvention(CalleeCC) && CCMatch) + return true; + return false; + } + + // Look for obvious safe cases to perform tail call optimization that do not + // require ABI changes. This is what gcc calls sibcall. + + // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to + // emit a special epilogue. + if (RegInfo->needsStackRealignment(MF)) + return false; + + // Also avoid sibcall optimization if either caller or callee uses struct + // return semantics. 
+ if (isCalleeStructRet || isCallerStructRet) + return false; + + // An stdcall caller is expected to clean up its arguments; the callee + // isn't going to do that. + if (!CCMatch && CallerCC==CallingConv::X86_StdCall) + return false; + + // Do not sibcall optimize vararg calls unless all arguments are passed via + // registers. + if (isVarArg && !Outs.empty()) { + + // Optimizing for varargs on Win64 is unlikely to be safe without + // additional testing. + if (Subtarget->isTargetWin64()) + return false; + + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); + + CCInfo.AnalyzeCallOperands(Outs, CC_X86); + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) + if (!ArgLocs[i].isRegLoc()) + return false; + } + + // If the call result is in ST0 / ST1, it needs to be popped off the x87 stack. + // Therefore if it's not used by the call it is not safe to optimize this into + // a sibcall. + bool Unused = false; + for (unsigned i = 0, e = Ins.size(); i != e; ++i) { + if (!Ins[i].Used) { + Unused = true; + break; + } + } + if (Unused) { + SmallVector<CCValAssign, 16> RVLocs; + CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), + getTargetMachine(), RVLocs, *DAG.getContext()); + CCInfo.AnalyzeCallResult(Ins, RetCC_X86); + for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { + CCValAssign &VA = RVLocs[i]; + if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) + return false; + } + } + + // If the calling conventions do not match, then we'd better make sure the + // results are returned in the same way as what the caller expects. + if (!CCMatch) { + SmallVector<CCValAssign, 16> RVLocs1; + CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), + getTargetMachine(), RVLocs1, *DAG.getContext()); + CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); + + SmallVector<CCValAssign, 16> RVLocs2; + CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), + getTargetMachine(), RVLocs2, *DAG.getContext()); + CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); + + if (RVLocs1.size() != RVLocs2.size()) + return false; + for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { + if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) + return false; + if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) + return false; + if (RVLocs1[i].isRegLoc()) { + if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) + return false; + } else { + if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) + return false; + } + } + } + + // If the callee takes no arguments then go on to check the results of the + // call. + if (!Outs.empty()) { + // Check if stack adjustment is needed. For now, do not do this if any + // argument is passed on the stack. + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); + + // Allocate shadow area for Win64 + if (Subtarget->isTargetWin64()) { + CCInfo.AllocateStack(32, 8); + } + + CCInfo.AnalyzeCallOperands(Outs, CC_X86); + if (CCInfo.getNextStackOffset()) { + MachineFunction &MF = DAG.getMachineFunction(); + if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) + return false; + + // Check if the arguments are already laid out in the right way as + // the caller's fixed stack objects. 
+ MachineFrameInfo *MFI = MF.getFrameInfo(); + const MachineRegisterInfo *MRI = &MF.getRegInfo(); + const X86InstrInfo *TII = + ((X86TargetMachine&)getTargetMachine()).getInstrInfo(); + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + SDValue Arg = OutVals[i]; + ISD::ArgFlagsTy Flags = Outs[i].Flags; + if (VA.getLocInfo() == CCValAssign::Indirect) + return false; + if (!VA.isRegLoc()) { + if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, + MFI, MRI, TII)) + return false; + } + } + } + + // If the tailcall address may be in a register, then make sure it's + // possible to register allocate for it. In 32-bit, the call address can + // only target EAX, EDX, or ECX since the tail call must be scheduled after + // callee-saved registers are restored. These happen to be the same + // registers used to pass 'inreg' arguments so watch out for those. + if (!Subtarget->is64Bit() && + !isa<GlobalAddressSDNode>(Callee) && + !isa<ExternalSymbolSDNode>(Callee)) { + unsigned NumInRegs = 0; + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + if (!VA.isRegLoc()) + continue; + unsigned Reg = VA.getLocReg(); + switch (Reg) { + default: break; + case X86::EAX: case X86::EDX: case X86::ECX: + if (++NumInRegs == 3) + return false; + break; + } + } + } + } + + return true; +} + +FastISel * +X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo) const { + return X86::createFastISel(funcInfo); +} + + +//===----------------------------------------------------------------------===// +// Other Lowering Hooks +//===----------------------------------------------------------------------===// + +static bool MayFoldLoad(SDValue Op) { + return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode()); +} + +static bool MayFoldIntoStore(SDValue Op) { + return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); +} + +static bool isTargetShuffle(unsigned Opcode) { + switch(Opcode) { + default: return false; + case X86ISD::PSHUFD: + case X86ISD::PSHUFHW: + case X86ISD::PSHUFLW: + case X86ISD::SHUFPD: + case X86ISD::PALIGN: + case X86ISD::SHUFPS: + case X86ISD::MOVLHPS: + case X86ISD::MOVLHPD: + case X86ISD::MOVHLPS: + case X86ISD::MOVLPS: + case X86ISD::MOVLPD: + case X86ISD::MOVSHDUP: + case X86ISD::MOVSLDUP: + case X86ISD::MOVDDUP: + case X86ISD::MOVSS: + case X86ISD::MOVSD: + case X86ISD::UNPCKLPS: + case X86ISD::UNPCKLPD: + case X86ISD::VUNPCKLPSY: + case X86ISD::VUNPCKLPDY: + case X86ISD::PUNPCKLWD: + case X86ISD::PUNPCKLBW: + case X86ISD::PUNPCKLDQ: + case X86ISD::PUNPCKLQDQ: + case X86ISD::UNPCKHPS: + case X86ISD::UNPCKHPD: + case X86ISD::VUNPCKHPSY: + case X86ISD::VUNPCKHPDY: + case X86ISD::PUNPCKHWD: + case X86ISD::PUNPCKHBW: + case X86ISD::PUNPCKHDQ: + case X86ISD::PUNPCKHQDQ: + case X86ISD::VPERMILPS: + case X86ISD::VPERMILPSY: + case X86ISD::VPERMILPD: + case X86ISD::VPERMILPDY: + case X86ISD::VPERM2F128: + return true; + } + return false; +} + +static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, + SDValue V1, SelectionDAG &DAG) { + switch(Opc) { + default: llvm_unreachable("Unknown x86 shuffle node"); + case X86ISD::MOVSHDUP: + case X86ISD::MOVSLDUP: + case X86ISD::MOVDDUP: + return DAG.getNode(Opc, dl, VT, V1); + } + + return SDValue(); +} + +static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, + SDValue V1, unsigned TargetMask, SelectionDAG &DAG) { + switch(Opc) { + default: llvm_unreachable("Unknown x86 shuffle node"); + case X86ISD::PSHUFD: + case X86ISD::PSHUFHW: + 
case X86ISD::PSHUFLW: + case X86ISD::VPERMILPS: + case X86ISD::VPERMILPSY: + case X86ISD::VPERMILPD: + case X86ISD::VPERMILPDY: + return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8)); + } + + return SDValue(); +} + +static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, + SDValue V1, SDValue V2, unsigned TargetMask, SelectionDAG &DAG) { + switch(Opc) { + default: llvm_unreachable("Unknown x86 shuffle node"); + case X86ISD::PALIGN: + case X86ISD::SHUFPD: + case X86ISD::SHUFPS: + case X86ISD::VPERM2F128: + return DAG.getNode(Opc, dl, VT, V1, V2, + DAG.getConstant(TargetMask, MVT::i8)); + } + return SDValue(); +} + +static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, + SDValue V1, SDValue V2, SelectionDAG &DAG) { + switch(Opc) { + default: llvm_unreachable("Unknown x86 shuffle node"); + case X86ISD::MOVLHPS: + case X86ISD::MOVLHPD: + case X86ISD::MOVHLPS: + case X86ISD::MOVLPS: + case X86ISD::MOVLPD: + case X86ISD::MOVSS: + case X86ISD::MOVSD: + case X86ISD::UNPCKLPS: + case X86ISD::UNPCKLPD: + case X86ISD::VUNPCKLPSY: + case X86ISD::VUNPCKLPDY: + case X86ISD::PUNPCKLWD: + case X86ISD::PUNPCKLBW: + case X86ISD::PUNPCKLDQ: + case X86ISD::PUNPCKLQDQ: + case X86ISD::UNPCKHPS: + case X86ISD::UNPCKHPD: + case X86ISD::VUNPCKHPSY: + case X86ISD::VUNPCKHPDY: + case X86ISD::PUNPCKHWD: + case X86ISD::PUNPCKHBW: + case X86ISD::PUNPCKHDQ: + case X86ISD::PUNPCKHQDQ: + return DAG.getNode(Opc, dl, VT, V1, V2); + } + return SDValue(); +} + +SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); + int ReturnAddrIndex = FuncInfo->getRAIndex(); + + if (ReturnAddrIndex == 0) { + // Set up a frame object for the return address. + uint64_t SlotSize = TD->getPointerSize(); + ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize, + false); + FuncInfo->setRAIndex(ReturnAddrIndex); + } + + return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy()); +} + + +bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, + bool hasSymbolicDisplacement) { + // Offset should fit into 32 bit immediate field. + if (!isInt<32>(Offset)) + return false; + + // If we don't have a symbolic displacement - we don't have any extra + // restrictions. + if (!hasSymbolicDisplacement) + return true; + + // FIXME: Some tweaks might be needed for medium code model. + if (M != CodeModel::Small && M != CodeModel::Kernel) + return false; + + // For small code model we assume that latest object is 16MB before end of 31 + // bits boundary. We may also accept pretty large negative constants knowing + // that all objects are in the positive half of address space. + if (M == CodeModel::Small && Offset < 16*1024*1024) + return true; + + // For kernel code model we know that all object resist in the negative half + // of 32bits address space. We may not accept negative offsets, since they may + // be just off and we may accept pretty large positive ones. + if (M == CodeModel::Kernel && Offset > 0) + return true; + + return false; +} + +/// isCalleePop - Determines whether the callee is required to pop its +/// own arguments. Callee pop is necessary to support tail calls. 
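+/// For example, per the switch below, stdcall, fastcall, and thiscall callees
+/// pop their arguments only on 32-bit x86, fastcc and GHC callees pop only
+/// when -tailcallopt is in effect, and vararg callees never pop.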
+bool X86::isCalleePop(CallingConv::ID CallingConv, + bool is64Bit, bool IsVarArg, bool TailCallOpt) { + if (IsVarArg) + return false; + + switch (CallingConv) { + default: + return false; + case CallingConv::X86_StdCall: + return !is64Bit; + case CallingConv::X86_FastCall: + return !is64Bit; + case CallingConv::X86_ThisCall: + return !is64Bit; + case CallingConv::Fast: + return TailCallOpt; + case CallingConv::GHC: + return TailCallOpt; + } +} + +/// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86 +/// specific condition code, returning the condition code and the LHS/RHS of the +/// comparison to make. +static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP, + SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { + if (!isFP) { + if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) { + if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) { + // X > -1 -> X == 0, jump !sign. + RHS = DAG.getConstant(0, RHS.getValueType()); + return X86::COND_NS; + } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { + // X < 0 -> X == 0, jump on sign. + return X86::COND_S; + } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { + // X < 1 -> X <= 0 + RHS = DAG.getConstant(0, RHS.getValueType()); + return X86::COND_LE; + } + } + + switch (SetCCOpcode) { + default: llvm_unreachable("Invalid integer condition!"); + case ISD::SETEQ: return X86::COND_E; + case ISD::SETGT: return X86::COND_G; + case ISD::SETGE: return X86::COND_GE; + case ISD::SETLT: return X86::COND_L; + case ISD::SETLE: return X86::COND_LE; + case ISD::SETNE: return X86::COND_NE; + case ISD::SETULT: return X86::COND_B; + case ISD::SETUGT: return X86::COND_A; + case ISD::SETULE: return X86::COND_BE; + case ISD::SETUGE: return X86::COND_AE; + } + } + + // First determine if it is required or is profitable to flip the operands. + + // If LHS is a foldable load, but RHS is not, flip the condition. + if (ISD::isNON_EXTLoad(LHS.getNode()) && + !ISD::isNON_EXTLoad(RHS.getNode())) { + SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode); + std::swap(LHS, RHS); + } + + switch (SetCCOpcode) { + default: break; + case ISD::SETOLT: + case ISD::SETOLE: + case ISD::SETUGT: + case ISD::SETUGE: + std::swap(LHS, RHS); + break; + } + + // On a floating point condition, the flags are set as follows: + // ZF PF CF op + // 0 | 0 | 0 | X > Y + // 0 | 0 | 1 | X < Y + // 1 | 0 | 0 | X == Y + // 1 | 1 | 1 | unordered + switch (SetCCOpcode) { + default: llvm_unreachable("Condcode should be pre-legalized away"); + case ISD::SETUEQ: + case ISD::SETEQ: return X86::COND_E; + case ISD::SETOLT: // flipped + case ISD::SETOGT: + case ISD::SETGT: return X86::COND_A; + case ISD::SETOLE: // flipped + case ISD::SETOGE: + case ISD::SETGE: return X86::COND_AE; + case ISD::SETUGT: // flipped + case ISD::SETULT: + case ISD::SETLT: return X86::COND_B; + case ISD::SETUGE: // flipped + case ISD::SETULE: + case ISD::SETLE: return X86::COND_BE; + case ISD::SETONE: + case ISD::SETNE: return X86::COND_NE; + case ISD::SETUO: return X86::COND_P; + case ISD::SETO: return X86::COND_NP; + case ISD::SETOEQ: + case ISD::SETUNE: return X86::COND_INVALID; + } +} + +/// hasFPCMov - is there a floating point cmov for the specific X86 condition +/// code. Current x86 isa includes the following FP cmov instructions: +/// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu. 
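+/// For example, an ordered ISD::SETOGT floating-point compare is translated
+/// above to X86::COND_A, for which fcmova/fcmovnbe exists, so a select on
+/// that compare can be lowered to an FP cmov.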
+static bool hasFPCMov(unsigned X86CC) { + switch (X86CC) { + default: + return false; + case X86::COND_B: + case X86::COND_BE: + case X86::COND_E: + case X86::COND_P: + case X86::COND_A: + case X86::COND_AE: + case X86::COND_NE: + case X86::COND_NP: + return true; + } +} + +/// isFPImmLegal - Returns true if the target can instruction select the +/// specified FP immediate natively. If false, the legalizer will +/// materialize the FP immediate as a load from a constant pool. +bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { + for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) { + if (Imm.bitwiseIsEqual(LegalFPImmediates[i])) + return true; + } + return false; +} + +/// isUndefOrInRange - Return true if Val is undef or if its value falls within +/// the specified range (L, H]. +static bool isUndefOrInRange(int Val, int Low, int Hi) { + return (Val < 0) || (Val >= Low && Val < Hi); +} + +/// isUndefOrInRange - Return true if every element in Mask, begining +/// from position Pos and ending in Pos+Size, falls within the specified +/// range (L, L+Pos]. or is undef. +static bool isUndefOrInRange(const SmallVectorImpl<int> &Mask, + int Pos, int Size, int Low, int Hi) { + for (int i = Pos, e = Pos+Size; i != e; ++i) + if (!isUndefOrInRange(Mask[i], Low, Hi)) + return false; + return true; +} + +/// isUndefOrEqual - Val is either less than zero (undef) or equal to the +/// specified value. +static bool isUndefOrEqual(int Val, int CmpVal) { + if (Val < 0 || Val == CmpVal) + return true; + return false; +} + +/// isSequentialOrUndefInRange - Return true if every element in Mask, begining +/// from position Pos and ending in Pos+Size, falls within the specified +/// sequential range (L, L+Pos]. or is undef. +static bool isSequentialOrUndefInRange(const SmallVectorImpl<int> &Mask, + int Pos, int Size, int Low) { + for (int i = Pos, e = Pos+Size; i != e; ++i, ++Low) + if (!isUndefOrEqual(Mask[i], Low)) + return false; + return true; +} + +/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that +/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference +/// the second operand. +static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) { + if (VT == MVT::v4f32 || VT == MVT::v4i32 ) + return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4); + if (VT == MVT::v2f64 || VT == MVT::v2i64) + return (Mask[0] < 2 && Mask[1] < 2); + return false; +} + +bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) { + SmallVector<int, 8> M; + N->getMask(M); + return ::isPSHUFDMask(M, N->getValueType(0)); +} + +/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that +/// is suitable for input to PSHUFHW. +static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) { + if (VT != MVT::v8i16) + return false; + + // Lower quadword copied in order or undef. + for (int i = 0; i != 4; ++i) + if (Mask[i] >= 0 && Mask[i] != i) + return false; + + // Upper quadword shuffled. + for (int i = 4; i != 8; ++i) + if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7)) + return false; + + return true; +} + +bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) { + SmallVector<int, 8> M; + N->getMask(M); + return ::isPSHUFHWMask(M, N->getValueType(0)); +} + +/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that +/// is suitable for input to PSHUFLW. +static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) { + if (VT != MVT::v8i16) + return false; + + // Upper quadword copied in order. 
+ for (int i = 4; i != 8; ++i) + if (Mask[i] >= 0 && Mask[i] != i) + return false; + + // Lower quadword shuffled. + for (int i = 0; i != 4; ++i) + if (Mask[i] >= 4) + return false; + + return true; +} + +bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) { + SmallVector<int, 8> M; + N->getMask(M); + return ::isPSHUFLWMask(M, N->getValueType(0)); +} + +/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that +/// is suitable for input to PALIGNR. +static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT, + bool hasSSSE3OrAVX) { + int i, e = VT.getVectorNumElements(); + if (VT.getSizeInBits() != 128 && VT.getSizeInBits() != 64) + return false; + + // Do not handle v2i64 / v2f64 shuffles with palignr. + if (e < 4 || !hasSSSE3OrAVX) + return false; + + for (i = 0; i != e; ++i) + if (Mask[i] >= 0) + break; + + // All undef, not a palignr. + if (i == e) + return false; + + // Make sure we're shifting in the right direction. + if (Mask[i] <= i) + return false; + + int s = Mask[i] - i; + + // Check the rest of the elements to see if they are consecutive. + for (++i; i != e; ++i) { + int m = Mask[i]; + if (m >= 0 && m != s+i) + return false; + } + return true; +} + +/// isVSHUFPSYMask - Return true if the specified VECTOR_SHUFFLE operand +/// specifies a shuffle of elements that is suitable for input to 256-bit +/// VSHUFPSY. +static bool isVSHUFPSYMask(const SmallVectorImpl<int> &Mask, EVT VT, + const X86Subtarget *Subtarget) { + int NumElems = VT.getVectorNumElements(); + + if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256) + return false; + + if (NumElems != 8) + return false; + + // VSHUFPSY divides the resulting vector into 4 chunks. + // The sources are also splitted into 4 chunks, and each destination + // chunk must come from a different source chunk. + // + // SRC1 => X7 X6 X5 X4 X3 X2 X1 X0 + // SRC2 => Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y9 + // + // DST => Y7..Y4, Y7..Y4, X7..X4, X7..X4, + // Y3..Y0, Y3..Y0, X3..X0, X3..X0 + // + int QuarterSize = NumElems/4; + int HalfSize = QuarterSize*2; + for (int i = 0; i < QuarterSize; ++i) + if (!isUndefOrInRange(Mask[i], 0, HalfSize)) + return false; + for (int i = QuarterSize; i < QuarterSize*2; ++i) + if (!isUndefOrInRange(Mask[i], NumElems, NumElems+HalfSize)) + return false; + + // The mask of the second half must be the same as the first but with + // the appropriate offsets. This works in the same way as VPERMILPS + // works with masks. + for (int i = QuarterSize*2; i < QuarterSize*3; ++i) { + if (!isUndefOrInRange(Mask[i], HalfSize, NumElems)) + return false; + int FstHalfIdx = i-HalfSize; + if (Mask[FstHalfIdx] < 0) + continue; + if (!isUndefOrEqual(Mask[i], Mask[FstHalfIdx]+HalfSize)) + return false; + } + for (int i = QuarterSize*3; i < NumElems; ++i) { + if (!isUndefOrInRange(Mask[i], NumElems+HalfSize, NumElems*2)) + return false; + int FstHalfIdx = i-HalfSize; + if (Mask[FstHalfIdx] < 0) + continue; + if (!isUndefOrEqual(Mask[i], Mask[FstHalfIdx]+HalfSize)) + return false; + + } + + return true; +} + +/// getShuffleVSHUFPSYImmediate - Return the appropriate immediate to shuffle +/// the specified VECTOR_MASK mask with VSHUFPSY instruction. 
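+/// For example, the v8f32 mask <1, 0, 11, 10, 5, 4, 15, 14> picks elements
+/// 1,0 of the V1 lane and 3,2 of the V2 lane in each 128-bit half, which
+/// encodes as 0b10110001 (0xb1).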
+static unsigned getShuffleVSHUFPSYImmediate(SDNode *N) { + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); + EVT VT = SVOp->getValueType(0); + int NumElems = VT.getVectorNumElements(); + + assert(NumElems == 8 && VT.getSizeInBits() == 256 && + "Only supports v8i32 and v8f32 types"); + + int HalfSize = NumElems/2; + unsigned Mask = 0; + for (int i = 0; i != NumElems ; ++i) { + if (SVOp->getMaskElt(i) < 0) + continue; + // The mask of the first half must be equal to the second one. + unsigned Shamt = (i%HalfSize)*2; + unsigned Elt = SVOp->getMaskElt(i) % HalfSize; + Mask |= Elt << Shamt; + } + + return Mask; +} + +/// isVSHUFPDYMask - Return true if the specified VECTOR_SHUFFLE operand +/// specifies a shuffle of elements that is suitable for input to 256-bit +/// VSHUFPDY. This shuffle doesn't have the same restriction as the PS +/// version and the mask of the second half isn't binded with the first +/// one. +static bool isVSHUFPDYMask(const SmallVectorImpl<int> &Mask, EVT VT, + const X86Subtarget *Subtarget) { + int NumElems = VT.getVectorNumElements(); + + if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256) + return false; + + if (NumElems != 4) + return false; + + // VSHUFPSY divides the resulting vector into 4 chunks. + // The sources are also splitted into 4 chunks, and each destination + // chunk must come from a different source chunk. + // + // SRC1 => X3 X2 X1 X0 + // SRC2 => Y3 Y2 Y1 Y0 + // + // DST => Y2..Y3, X2..X3, Y1..Y0, X1..X0 + // + int QuarterSize = NumElems/4; + int HalfSize = QuarterSize*2; + for (int i = 0; i < QuarterSize; ++i) + if (!isUndefOrInRange(Mask[i], 0, HalfSize)) + return false; + for (int i = QuarterSize; i < QuarterSize*2; ++i) + if (!isUndefOrInRange(Mask[i], NumElems, NumElems+HalfSize)) + return false; + for (int i = QuarterSize*2; i < QuarterSize*3; ++i) + if (!isUndefOrInRange(Mask[i], HalfSize, NumElems)) + return false; + for (int i = QuarterSize*3; i < NumElems; ++i) + if (!isUndefOrInRange(Mask[i], NumElems+HalfSize, NumElems*2)) + return false; + + return true; +} + +/// getShuffleVSHUFPDYImmediate - Return the appropriate immediate to shuffle +/// the specified VECTOR_MASK mask with VSHUFPDY instruction. +static unsigned getShuffleVSHUFPDYImmediate(SDNode *N) { + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); + EVT VT = SVOp->getValueType(0); + int NumElems = VT.getVectorNumElements(); + + assert(NumElems == 4 && VT.getSizeInBits() == 256 && + "Only supports v4i64 and v4f64 types"); + + int HalfSize = NumElems/2; + unsigned Mask = 0; + for (int i = 0; i != NumElems ; ++i) { + if (SVOp->getMaskElt(i) < 0) + continue; + int Elt = SVOp->getMaskElt(i) % HalfSize; + Mask |= Elt << i; + } + + return Mask; +} + +/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand +/// specifies a shuffle of elements that is suitable for input to 128-bit +/// SHUFPS and SHUFPD. 
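+/// For example, <0, 3, 5, 6> is accepted for v4f32 (low half taken from V1,
+/// high half from V2), while <4, 1, 2, 3> is rejected because the first
+/// element would have to come from V2.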
+static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { + int NumElems = VT.getVectorNumElements(); + + if (VT.getSizeInBits() != 128) + return false; + + if (NumElems != 2 && NumElems != 4) + return false; + + int Half = NumElems / 2; + for (int i = 0; i < Half; ++i) + if (!isUndefOrInRange(Mask[i], 0, NumElems)) + return false; + for (int i = Half; i < NumElems; ++i) + if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) + return false; + + return true; +} + +bool X86::isSHUFPMask(ShuffleVectorSDNode *N) { + SmallVector<int, 8> M; + N->getMask(M); + return ::isSHUFPMask(M, N->getValueType(0)); +} + +/// isCommutedSHUFP - Returns true if the shuffle mask is exactly +/// the reverse of what x86 shuffles want. x86 shuffles requires the lower +/// half elements to come from vector 1 (which would equal the dest.) and +/// the upper half to come from vector 2. +static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { + int NumElems = VT.getVectorNumElements(); + + if (NumElems != 2 && NumElems != 4) + return false; + + int Half = NumElems / 2; + for (int i = 0; i < Half; ++i) + if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) + return false; + for (int i = Half; i < NumElems; ++i) + if (!isUndefOrInRange(Mask[i], 0, NumElems)) + return false; + return true; +} + +static bool isCommutedSHUFP(ShuffleVectorSDNode *N) { + SmallVector<int, 8> M; + N->getMask(M); + return isCommutedSHUFPMask(M, N->getValueType(0)); +} + +/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand +/// specifies a shuffle of elements that is suitable for input to MOVHLPS. +bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) { + EVT VT = N->getValueType(0); + unsigned NumElems = VT.getVectorNumElements(); + + if (VT.getSizeInBits() != 128) + return false; + + if (NumElems != 4) + return false; + + // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3 + return isUndefOrEqual(N->getMaskElt(0), 6) && + isUndefOrEqual(N->getMaskElt(1), 7) && + isUndefOrEqual(N->getMaskElt(2), 2) && + isUndefOrEqual(N->getMaskElt(3), 3); +} + +/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form +/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef, +/// <2, 3, 2, 3> +bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) { + EVT VT = N->getValueType(0); + unsigned NumElems = VT.getVectorNumElements(); + + if (VT.getSizeInBits() != 128) + return false; + + if (NumElems != 4) + return false; + + return isUndefOrEqual(N->getMaskElt(0), 2) && + isUndefOrEqual(N->getMaskElt(1), 3) && + isUndefOrEqual(N->getMaskElt(2), 2) && + isUndefOrEqual(N->getMaskElt(3), 3); +} + +/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand +/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}. +bool X86::isMOVLPMask(ShuffleVectorSDNode *N) { + unsigned NumElems = N->getValueType(0).getVectorNumElements(); + + if (NumElems != 2 && NumElems != 4) + return false; + + for (unsigned i = 0; i < NumElems/2; ++i) + if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems)) + return false; + + for (unsigned i = NumElems/2; i < NumElems; ++i) + if (!isUndefOrEqual(N->getMaskElt(i), i)) + return false; + + return true; +} + +/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand +/// specifies a shuffle of elements that is suitable for input to MOVLHPS. 
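+/// For v4f32 this is the mask <0, 1, 4, 5>: the low half of V1 stays in
+/// place and the low half of V2 becomes the new high half.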
+bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) { + unsigned NumElems = N->getValueType(0).getVectorNumElements(); + + if ((NumElems != 2 && NumElems != 4) + || N->getValueType(0).getSizeInBits() > 128) + return false; + + for (unsigned i = 0; i < NumElems/2; ++i) + if (!isUndefOrEqual(N->getMaskElt(i), i)) + return false; + + for (unsigned i = 0; i < NumElems/2; ++i) + if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems)) + return false; + + return true; +} + +/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand +/// specifies a shuffle of elements that is suitable for input to UNPCKL. +static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT, + bool V2IsSplat = false) { + int NumElts = VT.getVectorNumElements(); + + assert((VT.is128BitVector() || VT.is256BitVector()) && + "Unsupported vector type for unpckh"); + + if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8) + return false; + + // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate + // independently on 128-bit lanes. + unsigned NumLanes = VT.getSizeInBits()/128; + unsigned NumLaneElts = NumElts/NumLanes; + + unsigned Start = 0; + unsigned End = NumLaneElts; + for (unsigned s = 0; s < NumLanes; ++s) { + for (unsigned i = Start, j = s * NumLaneElts; + i != End; + i += 2, ++j) { + int BitI = Mask[i]; + int BitI1 = Mask[i+1]; + if (!isUndefOrEqual(BitI, j)) + return false; + if (V2IsSplat) { + if (!isUndefOrEqual(BitI1, NumElts)) + return false; + } else { + if (!isUndefOrEqual(BitI1, j + NumElts)) + return false; + } + } + // Process the next 128 bits. + Start += NumLaneElts; + End += NumLaneElts; + } + + return true; +} + +bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) { + SmallVector<int, 8> M; + N->getMask(M); + return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat); +} + +/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand +/// specifies a shuffle of elements that is suitable for input to UNPCKH. +static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT, + bool V2IsSplat = false) { + int NumElts = VT.getVectorNumElements(); + + assert((VT.is128BitVector() || VT.is256BitVector()) && + "Unsupported vector type for unpckh"); + + if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8) + return false; + + // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate + // independently on 128-bit lanes. + unsigned NumLanes = VT.getSizeInBits()/128; + unsigned NumLaneElts = NumElts/NumLanes; + + unsigned Start = 0; + unsigned End = NumLaneElts; + for (unsigned l = 0; l != NumLanes; ++l) { + for (unsigned i = Start, j = (l*NumLaneElts)+NumLaneElts/2; + i != End; i += 2, ++j) { + int BitI = Mask[i]; + int BitI1 = Mask[i+1]; + if (!isUndefOrEqual(BitI, j)) + return false; + if (V2IsSplat) { + if (isUndefOrEqual(BitI1, NumElts)) + return false; + } else { + if (!isUndefOrEqual(BitI1, j+NumElts)) + return false; + } + } + // Process the next 128 bits. + Start += NumLaneElts; + End += NumLaneElts; + } + return true; +} + +bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) { + SmallVector<int, 8> M; + N->getMask(M); + return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat); +} + +/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form +/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. 
vector_shuffle v, undef, +/// <0, 0, 1, 1> +static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { + int NumElems = VT.getVectorNumElements(); + if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) + return false; + + // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern + // FIXME: Need a better way to get rid of this, there's no latency difference + // between UNPCKLPD and MOVDDUP, the later should always be checked first and + // the former later. We should also remove the "_undef" special mask. + if (NumElems == 4 && VT.getSizeInBits() == 256) + return false; + + // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate + // independently on 128-bit lanes. + unsigned NumLanes = VT.getSizeInBits() / 128; + unsigned NumLaneElts = NumElems / NumLanes; + + for (unsigned s = 0; s < NumLanes; ++s) { + for (unsigned i = s * NumLaneElts, j = s * NumLaneElts; + i != NumLaneElts * (s + 1); + i += 2, ++j) { + int BitI = Mask[i]; + int BitI1 = Mask[i+1]; + + if (!isUndefOrEqual(BitI, j)) + return false; + if (!isUndefOrEqual(BitI1, j)) + return false; + } + } + + return true; +} + +bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) { + SmallVector<int, 8> M; + N->getMask(M); + return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0)); +} + +/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form +/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef, +/// <2, 2, 3, 3> +static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { + int NumElems = VT.getVectorNumElements(); + if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) + return false; + + for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) { + int BitI = Mask[i]; + int BitI1 = Mask[i+1]; + if (!isUndefOrEqual(BitI, j)) + return false; + if (!isUndefOrEqual(BitI1, j)) + return false; + } + return true; +} + +bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) { + SmallVector<int, 8> M; + N->getMask(M); + return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0)); +} + +/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand +/// specifies a shuffle of elements that is suitable for input to MOVSS, +/// MOVSD, and MOVD, i.e. setting the lowest element. +static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) { + if (VT.getVectorElementType().getSizeInBits() < 32) + return false; + + int NumElts = VT.getVectorNumElements(); + + if (!isUndefOrEqual(Mask[0], NumElts)) + return false; + + for (int i = 1; i < NumElts; ++i) + if (!isUndefOrEqual(Mask[i], i)) + return false; + + return true; +} + +bool X86::isMOVLMask(ShuffleVectorSDNode *N) { + SmallVector<int, 8> M; + N->getMask(M); + return ::isMOVLMask(M, N->getValueType(0)); +} + +/// isVPERM2F128Mask - Match 256-bit shuffles where the elements are considered +/// as permutations between 128-bit chunks or halves. As an example: this +/// shuffle bellow: +/// vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15> +/// The first half comes from the second half of V1 and the second half from the +/// the second half of V2. +static bool isVPERM2F128Mask(const SmallVectorImpl<int> &Mask, EVT VT, + const X86Subtarget *Subtarget) { + if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256) + return false; + + // The shuffle result is divided into half A and half B. In total the two + // sources have 4 halves, namely: C, D, E, F. The final values of A and + // B must come from C, D, E or F. 
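+  // For example, in the <4, 5, 6, 7, 12, 13, 14, 15> case above, A is the
+  // upper half of V1 and B is the upper half of V2, so both halves match.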
+ int HalfSize = VT.getVectorNumElements()/2; + bool MatchA = false, MatchB = false; + + // Check if A comes from one of C, D, E, F. + for (int Half = 0; Half < 4; ++Half) { + if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) { + MatchA = true; + break; + } + } + + // Check if B comes from one of C, D, E, F. + for (int Half = 0; Half < 4; ++Half) { + if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) { + MatchB = true; + break; + } + } + + return MatchA && MatchB; +} + +/// getShuffleVPERM2F128Immediate - Return the appropriate immediate to shuffle +/// the specified VECTOR_MASK mask with VPERM2F128 instructions. +static unsigned getShuffleVPERM2F128Immediate(SDNode *N) { + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); + EVT VT = SVOp->getValueType(0); + + int HalfSize = VT.getVectorNumElements()/2; + + int FstHalf = 0, SndHalf = 0; + for (int i = 0; i < HalfSize; ++i) { + if (SVOp->getMaskElt(i) > 0) { + FstHalf = SVOp->getMaskElt(i)/HalfSize; + break; + } + } + for (int i = HalfSize; i < HalfSize*2; ++i) { + if (SVOp->getMaskElt(i) > 0) { + SndHalf = SVOp->getMaskElt(i)/HalfSize; + break; + } + } + + return (FstHalf | (SndHalf << 4)); +} + +/// isVPERMILPDMask - Return true if the specified VECTOR_SHUFFLE operand +/// specifies a shuffle of elements that is suitable for input to VPERMILPD*. +/// Note that VPERMIL mask matching is different depending whether theunderlying +/// type is 32 or 64. In the VPERMILPS the high half of the mask should point +/// to the same elements of the low, but to the higher half of the source. +/// In VPERMILPD the two lanes could be shuffled independently of each other +/// with the same restriction that lanes can't be crossed. +static bool isVPERMILPDMask(const SmallVectorImpl<int> &Mask, EVT VT, + const X86Subtarget *Subtarget) { + int NumElts = VT.getVectorNumElements(); + int NumLanes = VT.getSizeInBits()/128; + + if (!Subtarget->hasAVX()) + return false; + + // Only match 256-bit with 64-bit types + if (VT.getSizeInBits() != 256 || NumElts != 4) + return false; + + // The mask on the high lane is independent of the low. Both can match + // any element in inside its own lane, but can't cross. + int LaneSize = NumElts/NumLanes; + for (int l = 0; l < NumLanes; ++l) + for (int i = l*LaneSize; i < LaneSize*(l+1); ++i) { + int LaneStart = l*LaneSize; + if (!isUndefOrInRange(Mask[i], LaneStart, LaneStart+LaneSize)) + return false; + } + + return true; +} + +/// isVPERMILPSMask - Return true if the specified VECTOR_SHUFFLE operand +/// specifies a shuffle of elements that is suitable for input to VPERMILPS*. +/// Note that VPERMIL mask matching is different depending whether theunderlying +/// type is 32 or 64. In the VPERMILPS the high half of the mask should point +/// to the same elements of the low, but to the higher half of the source. +/// In VPERMILPD the two lanes could be shuffled independently of each other +/// with the same restriction that lanes can't be crossed. +static bool isVPERMILPSMask(const SmallVectorImpl<int> &Mask, EVT VT, + const X86Subtarget *Subtarget) { + unsigned NumElts = VT.getVectorNumElements(); + unsigned NumLanes = VT.getSizeInBits()/128; + + if (!Subtarget->hasAVX()) + return false; + + // Only match 256-bit with 32-bit types + if (VT.getSizeInBits() != 256 || NumElts != 8) + return false; + + // The mask on the high lane should be the same as the low. Actually, + // they can differ if any of the corresponding index in a lane is undef + // and the other stays in range. 
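+  // For example, <0, 3, 2, 1, 4, 7, 6, 5> is accepted: every index in the
+  // high lane equals the corresponding low-lane index plus the lane size (4).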
+ int LaneSize = NumElts/NumLanes; + for (int i = 0; i < LaneSize; ++i) { + int HighElt = i+LaneSize; + bool HighValid = isUndefOrInRange(Mask[HighElt], LaneSize, NumElts); + bool LowValid = isUndefOrInRange(Mask[i], 0, LaneSize); + + if (!HighValid || !LowValid) + return false; + if (Mask[i] < 0 || Mask[HighElt] < 0) + continue; + if (Mask[HighElt]-Mask[i] != LaneSize) + return false; + } + + return true; +} + +/// getShuffleVPERMILPSImmediate - Return the appropriate immediate to shuffle +/// the specified VECTOR_MASK mask with VPERMILPS* instructions. +static unsigned getShuffleVPERMILPSImmediate(SDNode *N) { + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); + EVT VT = SVOp->getValueType(0); + + int NumElts = VT.getVectorNumElements(); + int NumLanes = VT.getSizeInBits()/128; + int LaneSize = NumElts/NumLanes; + + // Although the mask is equal for both lanes do it twice to get the cases + // where a mask will match because the same mask element is undef on the + // first half but valid on the second. This would get pathological cases + // such as: shuffle <u, 0, 1, 2, 4, 4, 5, 6>, which is completely valid. + unsigned Mask = 0; + for (int l = 0; l < NumLanes; ++l) { + for (int i = 0; i < LaneSize; ++i) { + int MaskElt = SVOp->getMaskElt(i+(l*LaneSize)); + if (MaskElt < 0) + continue; + if (MaskElt >= LaneSize) + MaskElt -= LaneSize; + Mask |= MaskElt << (i*2); + } + } + + return Mask; +} + +/// getShuffleVPERMILPDImmediate - Return the appropriate immediate to shuffle +/// the specified VECTOR_MASK mask with VPERMILPD* instructions. +static unsigned getShuffleVPERMILPDImmediate(SDNode *N) { + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); + EVT VT = SVOp->getValueType(0); + + int NumElts = VT.getVectorNumElements(); + int NumLanes = VT.getSizeInBits()/128; + + unsigned Mask = 0; + int LaneSize = NumElts/NumLanes; + for (int l = 0; l < NumLanes; ++l) + for (int i = l*LaneSize; i < LaneSize*(l+1); ++i) { + int MaskElt = SVOp->getMaskElt(i); + if (MaskElt < 0) + continue; + Mask |= (MaskElt-l*LaneSize) << i; + } + + return Mask; +} + +/// isCommutedMOVL - Returns true if the shuffle mask is except the reverse +/// of what x86 movss want. X86 movs requires the lowest element to be lowest +/// element of vector 2 and the other elements to come from vector 1 in order. +static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT, + bool V2IsSplat = false, bool V2IsUndef = false) { + int NumOps = VT.getVectorNumElements(); + if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16) + return false; + + if (!isUndefOrEqual(Mask[0], 0)) + return false; + + for (int i = 1; i < NumOps; ++i) + if (!(isUndefOrEqual(Mask[i], i+NumOps) || + (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) || + (V2IsSplat && isUndefOrEqual(Mask[i], NumOps)))) + return false; + + return true; +} + +static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false, + bool V2IsUndef = false) { + SmallVector<int, 8> M; + N->getMask(M); + return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef); +} + +/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand +/// specifies a shuffle of elements that is suitable for input to MOVSHDUP. 
+/// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7> +bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N, + const X86Subtarget *Subtarget) { + if (!Subtarget->hasSSE3() && !Subtarget->hasAVX()) + return false; + + // The second vector must be undef + if (N->getOperand(1).getOpcode() != ISD::UNDEF) + return false; + + EVT VT = N->getValueType(0); + unsigned NumElems = VT.getVectorNumElements(); + + if ((VT.getSizeInBits() == 128 && NumElems != 4) || + (VT.getSizeInBits() == 256 && NumElems != 8)) + return false; + + // "i+1" is the value the indexed mask element must have + for (unsigned i = 0; i < NumElems; i += 2) + if (!isUndefOrEqual(N->getMaskElt(i), i+1) || + !isUndefOrEqual(N->getMaskElt(i+1), i+1)) + return false; + + return true; +} + +/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand +/// specifies a shuffle of elements that is suitable for input to MOVSLDUP. +/// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6> +bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N, + const X86Subtarget *Subtarget) { + if (!Subtarget->hasSSE3() && !Subtarget->hasAVX()) + return false; + + // The second vector must be undef + if (N->getOperand(1).getOpcode() != ISD::UNDEF) + return false; + + EVT VT = N->getValueType(0); + unsigned NumElems = VT.getVectorNumElements(); + + if ((VT.getSizeInBits() == 128 && NumElems != 4) || + (VT.getSizeInBits() == 256 && NumElems != 8)) + return false; + + // "i" is the value the indexed mask element must have + for (unsigned i = 0; i < NumElems; i += 2) + if (!isUndefOrEqual(N->getMaskElt(i), i) || + !isUndefOrEqual(N->getMaskElt(i+1), i)) + return false; + + return true; +} + +/// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand +/// specifies a shuffle of elements that is suitable for input to 256-bit +/// version of MOVDDUP. +static bool isMOVDDUPYMask(ShuffleVectorSDNode *N, + const X86Subtarget *Subtarget) { + EVT VT = N->getValueType(0); + int NumElts = VT.getVectorNumElements(); + bool V2IsUndef = N->getOperand(1).getOpcode() == ISD::UNDEF; + + if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256 || + !V2IsUndef || NumElts != 4) + return false; + + for (int i = 0; i != NumElts/2; ++i) + if (!isUndefOrEqual(N->getMaskElt(i), 0)) + return false; + for (int i = NumElts/2; i != NumElts; ++i) + if (!isUndefOrEqual(N->getMaskElt(i), NumElts/2)) + return false; + return true; +} + +/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand +/// specifies a shuffle of elements that is suitable for input to 128-bit +/// version of MOVDDUP. +bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) { + EVT VT = N->getValueType(0); + + if (VT.getSizeInBits() != 128) + return false; + + int e = VT.getVectorNumElements() / 2; + for (int i = 0; i < e; ++i) + if (!isUndefOrEqual(N->getMaskElt(i), i)) + return false; + for (int i = 0; i < e; ++i) + if (!isUndefOrEqual(N->getMaskElt(e+i), i)) + return false; + return true; +} + +/// isVEXTRACTF128Index - Return true if the specified +/// EXTRACT_SUBVECTOR operand specifies a vector extract that is +/// suitable for input to VEXTRACTF128. +bool X86::isVEXTRACTF128Index(SDNode *N) { + if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) + return false; + + // The index should be aligned on a 128-bit boundary. 
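+  // For example, extracting a v4f32 from a v8f32 is only valid at element
+  // index 0 or 4; index 2 would start 64 bits into the vector.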
+ uint64_t Index = + cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); + + unsigned VL = N->getValueType(0).getVectorNumElements(); + unsigned VBits = N->getValueType(0).getSizeInBits(); + unsigned ElSize = VBits / VL; + bool Result = (Index * ElSize) % 128 == 0; + + return Result; +} + +/// isVINSERTF128Index - Return true if the specified INSERT_SUBVECTOR +/// operand specifies a subvector insert that is suitable for input to +/// VINSERTF128. +bool X86::isVINSERTF128Index(SDNode *N) { + if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) + return false; + + // The index should be aligned on a 128-bit boundary. + uint64_t Index = + cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); + + unsigned VL = N->getValueType(0).getVectorNumElements(); + unsigned VBits = N->getValueType(0).getSizeInBits(); + unsigned ElSize = VBits / VL; + bool Result = (Index * ElSize) % 128 == 0; + + return Result; +} + +/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle +/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. +unsigned X86::getShuffleSHUFImmediate(SDNode *N) { + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); + int NumOperands = SVOp->getValueType(0).getVectorNumElements(); + + unsigned Shift = (NumOperands == 4) ? 2 : 1; + unsigned Mask = 0; + for (int i = 0; i < NumOperands; ++i) { + int Val = SVOp->getMaskElt(NumOperands-i-1); + if (Val < 0) Val = 0; + if (Val >= NumOperands) Val -= NumOperands; + Mask |= Val; + if (i != NumOperands - 1) + Mask <<= Shift; + } + return Mask; +} + +/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle +/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction. +unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) { + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); + unsigned Mask = 0; + // 8 nodes, but we only care about the last 4. + for (unsigned i = 7; i >= 4; --i) { + int Val = SVOp->getMaskElt(i); + if (Val >= 0) + Mask |= (Val - 4); + if (i != 4) + Mask <<= 2; + } + return Mask; +} + +/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle +/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction. +unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) { + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); + unsigned Mask = 0; + // 8 nodes, but we only care about the first 4. + for (int i = 3; i >= 0; --i) { + int Val = SVOp->getMaskElt(i); + if (Val >= 0) + Mask |= Val; + if (i != 0) + Mask <<= 2; + } + return Mask; +} + +/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle +/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. +unsigned X86::getShufflePALIGNRImmediate(SDNode *N) { + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); + EVT VVT = N->getValueType(0); + unsigned EltSize = VVT.getVectorElementType().getSizeInBits() >> 3; + int Val = 0; + + unsigned i, e; + for (i = 0, e = VVT.getVectorNumElements(); i != e; ++i) { + Val = SVOp->getMaskElt(i); + if (Val >= 0) + break; + } + assert(Val - i > 0 && "PALIGNR imm should be positive"); + return (Val - i) * EltSize; +} + +/// getExtractVEXTRACTF128Immediate - Return the appropriate immediate +/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128 +/// instructions. 
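+/// For example, extracting the upper v4f32 half of a v8f32 (element index 4)
+/// yields 4 / (128 / 32) = 1, selecting the second 128-bit chunk.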
+unsigned X86::getExtractVEXTRACTF128Immediate(SDNode *N) { + if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) + llvm_unreachable("Illegal extract subvector for VEXTRACTF128"); + + uint64_t Index = + cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); + + EVT VecVT = N->getOperand(0).getValueType(); + EVT ElVT = VecVT.getVectorElementType(); + + unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits(); + return Index / NumElemsPerChunk; +} + +/// getInsertVINSERTF128Immediate - Return the appropriate immediate +/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128 +/// instructions. +unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) { + if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) + llvm_unreachable("Illegal insert subvector for VINSERTF128"); + + uint64_t Index = + cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); + + EVT VecVT = N->getValueType(0); + EVT ElVT = VecVT.getVectorElementType(); + + unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits(); + return Index / NumElemsPerChunk; +} + +/// isZeroNode - Returns true if Elt is a constant zero or a floating point +/// constant +0.0. +bool X86::isZeroNode(SDValue Elt) { + return ((isa<ConstantSDNode>(Elt) && + cast<ConstantSDNode>(Elt)->isNullValue()) || + (isa<ConstantFPSDNode>(Elt) && + cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero())); +} + +/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in +/// their permute mask. +static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, + SelectionDAG &DAG) { + EVT VT = SVOp->getValueType(0); + unsigned NumElems = VT.getVectorNumElements(); + SmallVector<int, 8> MaskVec; + + for (unsigned i = 0; i != NumElems; ++i) { + int idx = SVOp->getMaskElt(i); + if (idx < 0) + MaskVec.push_back(idx); + else if (idx < (int)NumElems) + MaskVec.push_back(idx + NumElems); + else + MaskVec.push_back(idx - NumElems); + } + return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1), + SVOp->getOperand(0), &MaskVec[0]); +} + +/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming +/// the two vector operands have swapped position. +static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) { + unsigned NumElems = VT.getVectorNumElements(); + for (unsigned i = 0; i != NumElems; ++i) { + int idx = Mask[i]; + if (idx < 0) + continue; + else if (idx < (int)NumElems) + Mask[i] = idx + NumElems; + else + Mask[i] = idx - NumElems; + } +} + +/// ShouldXformToMOVHLPS - Return true if the node should be transformed to +/// match movhlps. The lower half elements should come from upper half of +/// V1 (and in order), and the upper half elements should come from the upper +/// half of V2 (and in order). +static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) { + EVT VT = Op->getValueType(0); + if (VT.getSizeInBits() != 128) + return false; + if (VT.getVectorNumElements() != 4) + return false; + for (unsigned i = 0, e = 2; i != e; ++i) + if (!isUndefOrEqual(Op->getMaskElt(i), i+2)) + return false; + for (unsigned i = 2; i != 4; ++i) + if (!isUndefOrEqual(Op->getMaskElt(i), i+4)) + return false; + return true; +} + +/// isScalarLoadToVector - Returns true if the node is a scalar load that +/// is promoted to a vector. It also returns the LoadSDNode by reference if +/// required. 
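+/// That is, it matches the pattern (scalar_to_vector (load x)) for a normal,
+/// non-extending load.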
+static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) { + if (N->getOpcode() != ISD::SCALAR_TO_VECTOR) + return false; + N = N->getOperand(0).getNode(); + if (!ISD::isNON_EXTLoad(N)) + return false; + if (LD) + *LD = cast<LoadSDNode>(N); + return true; +} + +/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to +/// match movlp{s|d}. The lower half elements should come from lower half of +/// V1 (and in order), and the upper half elements should come from the upper +/// half of V2 (and in order). And since V1 will become the source of the +/// MOVLP, it must be either a vector load or a scalar load to vector. +static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, + ShuffleVectorSDNode *Op) { + EVT VT = Op->getValueType(0); + if (VT.getSizeInBits() != 128) + return false; + + if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1)) + return false; + // Is V2 is a vector load, don't do this transformation. We will try to use + // load folding shufps op. + if (ISD::isNON_EXTLoad(V2)) + return false; + + unsigned NumElems = VT.getVectorNumElements(); + + if (NumElems != 2 && NumElems != 4) + return false; + for (unsigned i = 0, e = NumElems/2; i != e; ++i) + if (!isUndefOrEqual(Op->getMaskElt(i), i)) + return false; + for (unsigned i = NumElems/2; i != NumElems; ++i) + if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems)) + return false; + return true; +} + +/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are +/// all the same. +static bool isSplatVector(SDNode *N) { + if (N->getOpcode() != ISD::BUILD_VECTOR) + return false; + + SDValue SplatValue = N->getOperand(0); + for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i) + if (N->getOperand(i) != SplatValue) + return false; + return true; +} + +/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved +/// to an zero vector. +/// FIXME: move to dag combiner / method on ShuffleVectorSDNode +static bool isZeroShuffle(ShuffleVectorSDNode *N) { + SDValue V1 = N->getOperand(0); + SDValue V2 = N->getOperand(1); + unsigned NumElems = N->getValueType(0).getVectorNumElements(); + for (unsigned i = 0; i != NumElems; ++i) { + int Idx = N->getMaskElt(i); + if (Idx >= (int)NumElems) { + unsigned Opc = V2.getOpcode(); + if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode())) + continue; + if (Opc != ISD::BUILD_VECTOR || + !X86::isZeroNode(V2.getOperand(Idx-NumElems))) + return false; + } else if (Idx >= 0) { + unsigned Opc = V1.getOpcode(); + if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode())) + continue; + if (Opc != ISD::BUILD_VECTOR || + !X86::isZeroNode(V1.getOperand(Idx))) + return false; + } + } + return true; +} + +/// getZeroVector - Returns a vector of specified type with all zero elements. +/// +static SDValue getZeroVector(EVT VT, bool HasXMMInt, SelectionDAG &DAG, + DebugLoc dl) { + assert(VT.isVector() && "Expected a vector type"); + + // Always build SSE zero vectors as <4 x i32> bitcasted + // to their dest type. This ensures they get CSE'd. 
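+  // For example, with SSE2 a zero v2f64 is emitted as
+  // (v2f64 (bitcast (v4i32 <0,0,0,0>))), so all 128-bit zero vectors share
+  // the same underlying constant node.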
+ SDValue Vec; + if (VT.getSizeInBits() == 128) { // SSE + if (HasXMMInt) { // SSE2 + SDValue Cst = DAG.getTargetConstant(0, MVT::i32); + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); + } else { // SSE1 + SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); + } + } else if (VT.getSizeInBits() == 256) { // AVX + // 256-bit logic and arithmetic instructions in AVX are + // all floating-point, no support for integer ops. Default + // to emitting fp zeroed vectors then. + SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); + SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8); + } + return DAG.getNode(ISD::BITCAST, dl, VT, Vec); +} + +/// getOnesVector - Returns a vector of specified type with all bits set. +/// Always build ones vectors as <4 x i32>. For 256-bit types, use two +/// <4 x i32> inserted in a <8 x i32> appropriately. Then bitcast to their +/// original type, ensuring they get CSE'd. +static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) { + assert(VT.isVector() && "Expected a vector type"); + assert((VT.is128BitVector() || VT.is256BitVector()) + && "Expected a 128-bit or 256-bit vector type"); + + SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); + SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, + Cst, Cst, Cst, Cst); + + if (VT.is256BitVector()) { + SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, MVT::v8i32), + Vec, DAG.getConstant(0, MVT::i32), DAG, dl); + Vec = Insert128BitVector(InsV, Vec, + DAG.getConstant(4 /* NumElems/2 */, MVT::i32), DAG, dl); + } + + return DAG.getNode(ISD::BITCAST, dl, VT, Vec); +} + +/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements +/// that point to V2 points to its first element. +static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { + EVT VT = SVOp->getValueType(0); + unsigned NumElems = VT.getVectorNumElements(); + + bool Changed = false; + SmallVector<int, 8> MaskVec; + SVOp->getMask(MaskVec); + + for (unsigned i = 0; i != NumElems; ++i) { + if (MaskVec[i] > (int)NumElems) { + MaskVec[i] = NumElems; + Changed = true; + } + } + if (Changed) + return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0), + SVOp->getOperand(1), &MaskVec[0]); + return SDValue(SVOp, 0); +} + +/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd +/// operation of specified width. +static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, + SDValue V2) { + unsigned NumElems = VT.getVectorNumElements(); + SmallVector<int, 8> Mask; + Mask.push_back(NumElems); + for (unsigned i = 1; i != NumElems; ++i) + Mask.push_back(i); + return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); +} + +/// getUnpackl - Returns a vector_shuffle node for an unpackl operation. +static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, + SDValue V2) { + unsigned NumElems = VT.getVectorNumElements(); + SmallVector<int, 8> Mask; + for (unsigned i = 0, e = NumElems/2; i != e; ++i) { + Mask.push_back(i); + Mask.push_back(i + NumElems); + } + return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); +} + +/// getUnpackh - Returns a vector_shuffle node for an unpackh operation. 
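+/// For example, for v4i32 the resulting mask is <2, 6, 3, 7> (getUnpackl
+/// above produces <0, 4, 1, 5>).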
+static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, + SDValue V2) { + unsigned NumElems = VT.getVectorNumElements(); + unsigned Half = NumElems/2; + SmallVector<int, 8> Mask; + for (unsigned i = 0; i != Half; ++i) { + Mask.push_back(i + Half); + Mask.push_back(i + NumElems + Half); + } + return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); +} + +// PromoteSplati8i16 - All i16 and i8 vector types can't be used directly by +// a generic shuffle instruction because the target has no such instructions. +// Generate shuffles which repeat i16 and i8 several times until they can be +// represented by v4f32 and then be manipulated by target suported shuffles. +static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) { + EVT VT = V.getValueType(); + int NumElems = VT.getVectorNumElements(); + DebugLoc dl = V.getDebugLoc(); + + while (NumElems > 4) { + if (EltNo < NumElems/2) { + V = getUnpackl(DAG, dl, VT, V, V); + } else { + V = getUnpackh(DAG, dl, VT, V, V); + EltNo -= NumElems/2; + } + NumElems >>= 1; + } + return V; +} + +/// getLegalSplat - Generate a legal splat with supported x86 shuffles +static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) { + EVT VT = V.getValueType(); + DebugLoc dl = V.getDebugLoc(); + assert((VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256) + && "Vector size not supported"); + + if (VT.getSizeInBits() == 128) { + V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V); + int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo }; + V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32), + &SplatMask[0]); + } else { + // To use VPERMILPS to splat scalars, the second half of indicies must + // refer to the higher part, which is a duplication of the lower one, + // because VPERMILPS can only handle in-lane permutations. + int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo, + EltNo+4, EltNo+4, EltNo+4, EltNo+4 }; + + V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V); + V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32), + &SplatMask[0]); + } + + return DAG.getNode(ISD::BITCAST, dl, VT, V); +} + +/// PromoteSplat - Splat is promoted to target supported vector shuffles. +static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) { + EVT SrcVT = SV->getValueType(0); + SDValue V1 = SV->getOperand(0); + DebugLoc dl = SV->getDebugLoc(); + + int EltNo = SV->getSplatIndex(); + int NumElems = SrcVT.getVectorNumElements(); + unsigned Size = SrcVT.getSizeInBits(); + + assert(((Size == 128 && NumElems > 4) || Size == 256) && + "Unknown how to promote splat for type"); + + // Extract the 128-bit part containing the splat element and update + // the splat element index when it refers to the higher register. + if (Size == 256) { + unsigned Idx = (EltNo > NumElems/2) ? NumElems/2 : 0; + V1 = Extract128BitVector(V1, DAG.getConstant(Idx, MVT::i32), DAG, dl); + if (Idx > 0) + EltNo -= NumElems/2; + } + + // All i16 and i8 vector types can't be used directly by a generic shuffle + // instruction because the target has no such instruction. Generate shuffles + // which repeat i16 and i8 several times until they fit in i32, and then can + // be manipulated by target suported shuffles. + EVT EltVT = SrcVT.getVectorElementType(); + if (EltVT == MVT::i8 || EltVT == MVT::i16) + V1 = PromoteSplati8i16(V1, DAG, EltNo); + + // Recreate the 256-bit vector and place the same 128-bit vector + // into the low and high part. 
This is necessary because we want + // to use VPERM* to shuffle the vectors + if (Size == 256) { + SDValue InsV = Insert128BitVector(DAG.getUNDEF(SrcVT), V1, + DAG.getConstant(0, MVT::i32), DAG, dl); + V1 = Insert128BitVector(InsV, V1, + DAG.getConstant(NumElems/2, MVT::i32), DAG, dl); + } + + return getLegalSplat(DAG, V1, EltNo); +} + +/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified +/// vector of zero or undef vector. This produces a shuffle where the low +/// element of V2 is swizzled into the zero/undef vector, landing at element +/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). +static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, + bool isZero, bool HasXMMInt, + SelectionDAG &DAG) { + EVT VT = V2.getValueType(); + SDValue V1 = isZero + ? getZeroVector(VT, HasXMMInt, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT); + unsigned NumElems = VT.getVectorNumElements(); + SmallVector<int, 16> MaskVec; + for (unsigned i = 0; i != NumElems; ++i) + // If this is the insertion idx, put the low elt of V2 here. + MaskVec.push_back(i == Idx ? NumElems : i); + return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]); +} + +/// getShuffleScalarElt - Returns the scalar element that will make up the ith +/// element of the result of the vector shuffle. +static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, + unsigned Depth) { + if (Depth == 6) + return SDValue(); // Limit search depth. + + SDValue V = SDValue(N, 0); + EVT VT = V.getValueType(); + unsigned Opcode = V.getOpcode(); + + // Recurse into ISD::VECTOR_SHUFFLE node to find scalars. + if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) { + Index = SV->getMaskElt(Index); + + if (Index < 0) + return DAG.getUNDEF(VT.getVectorElementType()); + + int NumElems = VT.getVectorNumElements(); + SDValue NewV = (Index < NumElems) ? SV->getOperand(0) : SV->getOperand(1); + return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG, Depth+1); + } + + // Recurse into target specific vector shuffles to find scalars. 
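+  // Shuffles that carry an immediate (PSHUF*, SHUFP*, VPERMILP*, VPERM2F128)
+  // have their mask reconstructed from that operand via the Decode* helpers
+  // below; the UNPCK/MOVHLPS/MOVLHPS forms use fixed masks.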
+ if (isTargetShuffle(Opcode)) { + int NumElems = VT.getVectorNumElements(); + SmallVector<unsigned, 16> ShuffleMask; + SDValue ImmN; + + switch(Opcode) { + case X86ISD::SHUFPS: + case X86ISD::SHUFPD: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodeSHUFPSMask(NumElems, + cast<ConstantSDNode>(ImmN)->getZExtValue(), + ShuffleMask); + break; + case X86ISD::PUNPCKHBW: + case X86ISD::PUNPCKHWD: + case X86ISD::PUNPCKHDQ: + case X86ISD::PUNPCKHQDQ: + DecodePUNPCKHMask(NumElems, ShuffleMask); + break; + case X86ISD::UNPCKHPS: + case X86ISD::UNPCKHPD: + case X86ISD::VUNPCKHPSY: + case X86ISD::VUNPCKHPDY: + DecodeUNPCKHPMask(NumElems, ShuffleMask); + break; + case X86ISD::PUNPCKLBW: + case X86ISD::PUNPCKLWD: + case X86ISD::PUNPCKLDQ: + case X86ISD::PUNPCKLQDQ: + DecodePUNPCKLMask(VT, ShuffleMask); + break; + case X86ISD::UNPCKLPS: + case X86ISD::UNPCKLPD: + case X86ISD::VUNPCKLPSY: + case X86ISD::VUNPCKLPDY: + DecodeUNPCKLPMask(VT, ShuffleMask); + break; + case X86ISD::MOVHLPS: + DecodeMOVHLPSMask(NumElems, ShuffleMask); + break; + case X86ISD::MOVLHPS: + DecodeMOVLHPSMask(NumElems, ShuffleMask); + break; + case X86ISD::PSHUFD: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodePSHUFMask(NumElems, + cast<ConstantSDNode>(ImmN)->getZExtValue(), + ShuffleMask); + break; + case X86ISD::PSHUFHW: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodePSHUFHWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), + ShuffleMask); + break; + case X86ISD::PSHUFLW: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodePSHUFLWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), + ShuffleMask); + break; + case X86ISD::MOVSS: + case X86ISD::MOVSD: { + // The index 0 always comes from the first element of the second source, + // this is why MOVSS and MOVSD are used in the first place. The other + // elements come from the other positions of the first source vector. + unsigned OpNum = (Index == 0) ? 1 : 0; + return getShuffleScalarElt(V.getOperand(OpNum).getNode(), Index, DAG, + Depth+1); + } + case X86ISD::VPERMILPS: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodeVPERMILPSMask(4, cast<ConstantSDNode>(ImmN)->getZExtValue(), + ShuffleMask); + break; + case X86ISD::VPERMILPSY: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodeVPERMILPSMask(8, cast<ConstantSDNode>(ImmN)->getZExtValue(), + ShuffleMask); + break; + case X86ISD::VPERMILPD: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodeVPERMILPDMask(2, cast<ConstantSDNode>(ImmN)->getZExtValue(), + ShuffleMask); + break; + case X86ISD::VPERMILPDY: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodeVPERMILPDMask(4, cast<ConstantSDNode>(ImmN)->getZExtValue(), + ShuffleMask); + break; + case X86ISD::VPERM2F128: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodeVPERM2F128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), + ShuffleMask); + break; + case X86ISD::MOVDDUP: + case X86ISD::MOVLHPD: + case X86ISD::MOVLPD: + case X86ISD::MOVLPS: + case X86ISD::MOVSHDUP: + case X86ISD::MOVSLDUP: + case X86ISD::PALIGN: + return SDValue(); // Not yet implemented. + default: + assert(0 && "unknown target shuffle node"); + return SDValue(); + } + + Index = ShuffleMask[Index]; + if (Index < 0) + return DAG.getUNDEF(VT.getVectorElementType()); + + SDValue NewV = (Index < NumElems) ? 
N->getOperand(0) : N->getOperand(1); + return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG, + Depth+1); + } + + // Actual nodes that may contain scalar elements + if (Opcode == ISD::BITCAST) { + V = V.getOperand(0); + EVT SrcVT = V.getValueType(); + unsigned NumElems = VT.getVectorNumElements(); + + if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems) + return SDValue(); + } + + if (V.getOpcode() == ISD::SCALAR_TO_VECTOR) + return (Index == 0) ? V.getOperand(0) + : DAG.getUNDEF(VT.getVectorElementType()); + + if (V.getOpcode() == ISD::BUILD_VECTOR) + return V.getOperand(Index); + + return SDValue(); +} + +/// getNumOfConsecutiveZeros - Return the number of elements of a vector +/// shuffle operation which come from a consecutively from a zero. The +/// search can start in two different directions, from left or right. +static +unsigned getNumOfConsecutiveZeros(SDNode *N, int NumElems, + bool ZerosFromLeft, SelectionDAG &DAG) { + int i = 0; + + while (i < NumElems) { + unsigned Index = ZerosFromLeft ? i : NumElems-i-1; + SDValue Elt = getShuffleScalarElt(N, Index, DAG, 0); + if (!(Elt.getNode() && + (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt)))) + break; + ++i; + } + + return i; +} + +/// isShuffleMaskConsecutive - Check if the shuffle mask indicies from MaskI to +/// MaskE correspond consecutively to elements from one of the vector operands, +/// starting from its index OpIdx. Also tell OpNum which source vector operand. +static +bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp, int MaskI, int MaskE, + int OpIdx, int NumElems, unsigned &OpNum) { + bool SeenV1 = false; + bool SeenV2 = false; + + for (int i = MaskI; i <= MaskE; ++i, ++OpIdx) { + int Idx = SVOp->getMaskElt(i); + // Ignore undef indicies + if (Idx < 0) + continue; + + if (Idx < NumElems) + SeenV1 = true; + else + SeenV2 = true; + + // Only accept consecutive elements from the same vector + if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2)) + return false; + } + + OpNum = SeenV1 ? 0 : 1; + return true; +} + +/// isVectorShiftRight - Returns true if the shuffle can be implemented as a +/// logical left shift of a vector. +static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, + bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { + unsigned NumElems = SVOp->getValueType(0).getVectorNumElements(); + unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, + false /* check zeros from right */, DAG); + unsigned OpSrc; + + if (!NumZeros) + return false; + + // Considering the elements in the mask that are not consecutive zeros, + // check if they consecutively come from only one of the source vectors. + // + // V1 = {X, A, B, C} 0 + // \ \ \ / + // vector_shuffle V1, V2 <1, 2, 3, X> + // + if (!isShuffleMaskConsecutive(SVOp, + 0, // Mask Start Index + NumElems-NumZeros-1, // Mask End Index + NumZeros, // Where to start looking in the src vector + NumElems, // Number of elements in vector + OpSrc)) // Which source operand ? + return false; + + isLeft = false; + ShAmt = NumZeros; + ShVal = SVOp->getOperand(OpSrc); + return true; +} + +/// isVectorShiftLeft - Returns true if the shuffle can be implemented as a +/// logical left shift of a vector. 
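+/// For example, a v4i32 shuffle <X, X, 4, 5> whose first two result elements
+/// are known zeros and whose last two are elements 0,1 of V2 is a left shift
+/// of V2 by two elements (a PSLLDQ of 8 bytes).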
+static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, + bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { + unsigned NumElems = SVOp->getValueType(0).getVectorNumElements(); + unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, + true /* check zeros from left */, DAG); + unsigned OpSrc; + + if (!NumZeros) + return false; + + // Considering the elements in the mask that are not consecutive zeros, + // check if they consecutively come from only one of the source vectors. + // + // 0 { A, B, X, X } = V2 + // / \ / / + // vector_shuffle V1, V2 <X, X, 4, 5> + // + if (!isShuffleMaskConsecutive(SVOp, + NumZeros, // Mask Start Index + NumElems-1, // Mask End Index + 0, // Where to start looking in the src vector + NumElems, // Number of elements in vector + OpSrc)) // Which source operand ? + return false; + + isLeft = true; + ShAmt = NumZeros; + ShVal = SVOp->getOperand(OpSrc); + return true; +} + +/// isVectorShift - Returns true if the shuffle can be implemented as a +/// logical left or right shift of a vector. +static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, + bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { + // Although the logic below support any bitwidth size, there are no + // shift instructions which handle more than 128-bit vectors. + if (SVOp->getValueType(0).getSizeInBits() > 128) + return false; + + if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) || + isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt)) + return true; + + return false; +} + +/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. +/// +static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, + unsigned NumNonZero, unsigned NumZero, + SelectionDAG &DAG, + const TargetLowering &TLI) { + if (NumNonZero > 8) + return SDValue(); + + DebugLoc dl = Op.getDebugLoc(); + SDValue V(0, 0); + bool First = true; + for (unsigned i = 0; i < 16; ++i) { + bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; + if (ThisIsNonZero && First) { + if (NumZero) + V = getZeroVector(MVT::v8i16, true, DAG, dl); + else + V = DAG.getUNDEF(MVT::v8i16); + First = false; + } + + if ((i & 1) != 0) { + SDValue ThisElt(0, 0), LastElt(0, 0); + bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; + if (LastIsNonZero) { + LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, + MVT::i16, Op.getOperand(i-1)); + } + if (ThisIsNonZero) { + ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); + ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, + ThisElt, DAG.getConstant(8, MVT::i8)); + if (LastIsNonZero) + ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); + } else + ThisElt = LastElt; + + if (ThisElt.getNode()) + V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, + DAG.getIntPtrConstant(i/2)); + } + } + + return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V); +} + +/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. 
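+/// For illustration: a build_vector such as <x, 0, 0, 0, y, 0, 0, 0> is
+/// lowered to a zero vector followed by two element insertions (typically
+/// matched as pinsrw), assuming no more than four non-zero operands.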
+/// +static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, + unsigned NumNonZero, unsigned NumZero, + SelectionDAG &DAG, + const TargetLowering &TLI) { + if (NumNonZero > 4) + return SDValue(); + + DebugLoc dl = Op.getDebugLoc(); + SDValue V(0, 0); + bool First = true; + for (unsigned i = 0; i < 8; ++i) { + bool isNonZero = (NonZeros & (1 << i)) != 0; + if (isNonZero) { + if (First) { + if (NumZero) + V = getZeroVector(MVT::v8i16, true, DAG, dl); + else + V = DAG.getUNDEF(MVT::v8i16); + First = false; + } + V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, + MVT::v8i16, V, Op.getOperand(i), + DAG.getIntPtrConstant(i)); + } + } + + return V; +} + +/// getVShift - Return a vector logical shift node. +/// +static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, + unsigned NumBits, SelectionDAG &DAG, + const TargetLowering &TLI, DebugLoc dl) { + assert(VT.getSizeInBits() == 128 && "Unknown type for VShift"); + EVT ShVT = MVT::v2i64; + unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL; + SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp); + return DAG.getNode(ISD::BITCAST, dl, VT, + DAG.getNode(Opc, dl, ShVT, SrcOp, + DAG.getConstant(NumBits, + TLI.getShiftAmountTy(SrcOp.getValueType())))); +} + +SDValue +X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, + SelectionDAG &DAG) const { + + // Check if the scalar load can be widened into a vector load. And if + // the address is "base + cst" see if the cst can be "absorbed" into + // the shuffle mask. + if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) { + SDValue Ptr = LD->getBasePtr(); + if (!ISD::isNormalLoad(LD) || LD->isVolatile()) + return SDValue(); + EVT PVT = LD->getValueType(0); + if (PVT != MVT::i32 && PVT != MVT::f32) + return SDValue(); + + int FI = -1; + int64_t Offset = 0; + if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) { + FI = FINode->getIndex(); + Offset = 0; + } else if (DAG.isBaseWithConstantOffset(Ptr) && + isa<FrameIndexSDNode>(Ptr.getOperand(0))) { + FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex(); + Offset = Ptr.getConstantOperandVal(1); + Ptr = Ptr.getOperand(0); + } else { + return SDValue(); + } + + // FIXME: 256-bit vector instructions don't require a strict alignment, + // improve this code to support it better. + unsigned RequiredAlign = VT.getSizeInBits()/8; + SDValue Chain = LD->getChain(); + // Make sure the stack object alignment is at least 16 or 32. + MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) { + if (MFI->isFixedObjectIndex(FI)) { + // Can't change the alignment. FIXME: It's possible to compute + // the exact stack offset and reference FI + adjust offset instead. + // If someone *really* cares about this. That's the way to implement it. + return SDValue(); + } else { + MFI->setObjectAlignment(FI, RequiredAlign); + } + } + + // (Offset % 16 or 32) must be multiple of 4. Then address is then + // Ptr + (Offset & ~15). + if (Offset < 0) + return SDValue(); + if ((Offset % RequiredAlign) & 3) + return SDValue(); + int64_t StartOffset = Offset & ~(RequiredAlign-1); + if (StartOffset) + Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(), + Ptr,DAG.getConstant(StartOffset, Ptr.getValueType())); + + int EltNo = (Offset - StartOffset) >> 2; + int NumElems = VT.getVectorNumElements(); + + EVT CanonVT = VT.getSizeInBits() == 128 ? 
MVT::v4i32 : MVT::v8i32; + EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems); + SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr, + LD->getPointerInfo().getWithOffset(StartOffset), + false, false, 0); + + // Canonicalize it to a v4i32 or v8i32 shuffle. + SmallVector<int, 8> Mask; + for (int i = 0; i < NumElems; ++i) + Mask.push_back(EltNo); + + V1 = DAG.getNode(ISD::BITCAST, dl, CanonVT, V1); + return DAG.getNode(ISD::BITCAST, dl, NVT, + DAG.getVectorShuffle(CanonVT, dl, V1, + DAG.getUNDEF(CanonVT),&Mask[0])); + } + + return SDValue(); +} + +/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a +/// vector of type 'VT', see if the elements can be replaced by a single large +/// load which has the same value as a build_vector whose operands are 'elts'. +/// +/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a +/// +/// FIXME: we'd also like to handle the case where the last elements are zero +/// rather than undef via VZEXT_LOAD, but we do not detect that case today. +/// There's even a handy isZeroNode for that purpose. +static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, + DebugLoc &DL, SelectionDAG &DAG) { + EVT EltVT = VT.getVectorElementType(); + unsigned NumElems = Elts.size(); + + LoadSDNode *LDBase = NULL; + unsigned LastLoadedElt = -1U; + + // For each element in the initializer, see if we've found a load or an undef. + // If we don't find an initial load element, or later load elements are + // non-consecutive, bail out. + for (unsigned i = 0; i < NumElems; ++i) { + SDValue Elt = Elts[i]; + + if (!Elt.getNode() || + (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) + return SDValue(); + if (!LDBase) { + if (Elt.getNode()->getOpcode() == ISD::UNDEF) + return SDValue(); + LDBase = cast<LoadSDNode>(Elt.getNode()); + LastLoadedElt = i; + continue; + } + if (Elt.getOpcode() == ISD::UNDEF) + continue; + + LoadSDNode *LD = cast<LoadSDNode>(Elt); + if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i)) + return SDValue(); + LastLoadedElt = i; + } + + // If we have found an entire vector of loads and undefs, then return a large + // load of the entire vector width starting at the base pointer. If we found + // consecutive loads for the low half, generate a vzext_load node. 
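+  // For illustration (v4i32): <load a, load a+4, load a+8, load a+12> becomes
+  // a single wide load of a, while <load a, load a+4, undef, undef> becomes
+  // an X86ISD::VZEXT_LOAD of the low 64 bits, assuming v2i64 is legal.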
+ if (LastLoadedElt == NumElems - 1) { + if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16) + return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), + LDBase->getPointerInfo(), + LDBase->isVolatile(), LDBase->isNonTemporal(), 0); + return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), + LDBase->getPointerInfo(), + LDBase->isVolatile(), LDBase->isNonTemporal(), + LDBase->getAlignment()); + } else if (NumElems == 4 && LastLoadedElt == 1 && + DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) { + SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); + SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; + SDValue ResNode = + DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, 2, MVT::i64, + LDBase->getPointerInfo(), + LDBase->getAlignment(), + false/*isVolatile*/, true/*ReadMem*/, + false/*WriteMem*/); + return DAG.getNode(ISD::BITCAST, DL, VT, ResNode); + } + return SDValue(); +} + +SDValue +X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { + DebugLoc dl = Op.getDebugLoc(); + + EVT VT = Op.getValueType(); + EVT ExtVT = VT.getVectorElementType(); + unsigned NumElems = Op.getNumOperands(); + + // Vectors containing all zeros can be matched by pxor and xorps later + if (ISD::isBuildVectorAllZeros(Op.getNode())) { + // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd + // and 2) ensure that i64 scalars are eliminated on x86-32 hosts. + if (Op.getValueType() == MVT::v4i32 || + Op.getValueType() == MVT::v8i32) + return Op; + + return getZeroVector(Op.getValueType(), Subtarget->hasXMMInt(), DAG, dl); + } + + // Vectors containing all ones can be matched by pcmpeqd on 128-bit width + // vectors or broken into v4i32 operations on 256-bit vectors. + if (ISD::isBuildVectorAllOnes(Op.getNode())) { + if (Op.getValueType() == MVT::v4i32) + return Op; + + return getOnesVector(Op.getValueType(), DAG, dl); + } + + unsigned EVTBits = ExtVT.getSizeInBits(); + + unsigned NumZero = 0; + unsigned NumNonZero = 0; + unsigned NonZeros = 0; + bool IsAllConstants = true; + SmallSet<SDValue, 8> Values; + for (unsigned i = 0; i < NumElems; ++i) { + SDValue Elt = Op.getOperand(i); + if (Elt.getOpcode() == ISD::UNDEF) + continue; + Values.insert(Elt); + if (Elt.getOpcode() != ISD::Constant && + Elt.getOpcode() != ISD::ConstantFP) + IsAllConstants = false; + if (X86::isZeroNode(Elt)) + NumZero++; + else { + NonZeros |= (1 << i); + NumNonZero++; + } + } + + // All undef vector. Return an UNDEF. All zero vectors were handled above. + if (NumNonZero == 0) + return DAG.getUNDEF(VT); + + // Special case for single non-zero, non-undef, element. + if (NumNonZero == 1) { + unsigned Idx = CountTrailingZeros_32(NonZeros); + SDValue Item = Op.getOperand(Idx); + + // If this is an insertion of an i64 value on x86-32, and if the top bits of + // the value are obviously zero, truncate the value to i32 and do the + // insertion that way. Only do this if the value is non-constant or if the + // value is a constant being inserted into element 0. It is cheaper to do + // a constant pool load than it is to do a movd + shuffle. + if (ExtVT == MVT::i64 && !Subtarget->is64Bit() && + (!IsAllConstants || Idx == 0)) { + if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { + // Handle SSE only. + assert(VT == MVT::v2i64 && "Expected an SSE value type!"); + EVT VecVT = MVT::v4i32; + unsigned VecElts = 4; + + // Truncate the value (which may itself be a constant) to i32, and + // convert it to a vector with movd (S2V+shuffle to zero extend). 
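+        // For illustration: on x86-32, build_vector <i64 C, i64 0> with the
+        // upper 32 bits of C known zero is rebuilt as the v4i32 vector
+        // <trunc C, 0, 0, 0> (a movd plus a zeroing shuffle) and then
+        // bitcast back to v2i64.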
+ Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); + Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); + Item = getShuffleVectorZeroOrUndef(Item, 0, true, + Subtarget->hasXMMInt(), DAG); + + // Now we have our 32-bit value zero extended in the low element of + // a vector. If Idx != 0, swizzle it into place. + if (Idx != 0) { + SmallVector<int, 4> Mask; + Mask.push_back(Idx); + for (unsigned i = 1; i != VecElts; ++i) + Mask.push_back(i); + Item = DAG.getVectorShuffle(VecVT, dl, Item, + DAG.getUNDEF(Item.getValueType()), + &Mask[0]); + } + return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Item); + } + } + + // If we have a constant or non-constant insertion into the low element of + // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into + // the rest of the elements. This will be matched as movd/movq/movss/movsd + // depending on what the source datatype is. + if (Idx == 0) { + if (NumZero == 0) { + return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); + } else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || + (ExtVT == MVT::i64 && Subtarget->is64Bit())) { + Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); + // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. + return getShuffleVectorZeroOrUndef(Item, 0, true,Subtarget->hasXMMInt(), + DAG); + } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { + Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); + assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!"); + EVT MiddleVT = MVT::v4i32; + Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item); + Item = getShuffleVectorZeroOrUndef(Item, 0, true, + Subtarget->hasXMMInt(), DAG); + return DAG.getNode(ISD::BITCAST, dl, VT, Item); + } + } + + // Is it a vector logical left shift? + if (NumElems == 2 && Idx == 1 && + X86::isZeroNode(Op.getOperand(0)) && + !X86::isZeroNode(Op.getOperand(1))) { + unsigned NumBits = VT.getSizeInBits(); + return getVShift(true, VT, + DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, + VT, Op.getOperand(1)), + NumBits/2, DAG, *this, dl); + } + + if (IsAllConstants) // Otherwise, it's better to do a constpool load. + return SDValue(); + + // Otherwise, if this is a vector with i32 or f32 elements, and the element + // is a non-constant being inserted into an element other than the low one, + // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka + // movd/movss) to move this into the low element, then shuffle it into + // place. + if (EVTBits == 32) { + Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); + + // Turn it into a shuffle of zero and zero-extended scalar to vector. + Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, + Subtarget->hasXMMInt(), DAG); + SmallVector<int, 8> MaskVec; + for (unsigned i = 0; i < NumElems; i++) + MaskVec.push_back(i == Idx ? 0 : 1); + return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]); + } + } + + // Splat is obviously ok. Let legalizer expand it to a shuffle. + if (Values.size() == 1) { + if (EVTBits == 32) { + // Instead of a shuffle like this: + // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> + // Check if it's possible to issue this instead. 
+ // shuffle (vload ptr)), undef, <1, 1, 1, 1> + unsigned Idx = CountTrailingZeros_32(NonZeros); + SDValue Item = Op.getOperand(Idx); + if (Op.getNode()->isOnlyUserOf(Item.getNode())) + return LowerAsSplatVectorLoad(Item, VT, dl, DAG); + } + return SDValue(); + } + + // A vector full of immediates; various special cases are already + // handled, so this is best done with a single constant-pool load. + if (IsAllConstants) + return SDValue(); + + // For AVX-length vectors, build the individual 128-bit pieces and use + // shuffles to put them in place. + if (VT.getSizeInBits() == 256 && !ISD::isBuildVectorAllZeros(Op.getNode())) { + SmallVector<SDValue, 32> V; + for (unsigned i = 0; i < NumElems; ++i) + V.push_back(Op.getOperand(i)); + + EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2); + + // Build both the lower and upper subvector. + SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2); + SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2], + NumElems/2); + + // Recreate the wider vector with the lower and upper part. + SDValue Vec = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Lower, + DAG.getConstant(0, MVT::i32), DAG, dl); + return Insert128BitVector(Vec, Upper, DAG.getConstant(NumElems/2, MVT::i32), + DAG, dl); + } + + // Let legalizer expand 2-wide build_vectors. + if (EVTBits == 64) { + if (NumNonZero == 1) { + // One half is zero or undef. + unsigned Idx = CountTrailingZeros_32(NonZeros); + SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, + Op.getOperand(Idx)); + return getShuffleVectorZeroOrUndef(V2, Idx, true, + Subtarget->hasXMMInt(), DAG); + } + return SDValue(); + } + + // If element VT is < 32 bits, convert it to inserts into a zero vector. + if (EVTBits == 8 && NumElems == 16) { + SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, + *this); + if (V.getNode()) return V; + } + + if (EVTBits == 16 && NumElems == 8) { + SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, + *this); + if (V.getNode()) return V; + } + + // If element VT is == 32 bits, turn it into a number of shuffles. + SmallVector<SDValue, 8> V; + V.resize(NumElems); + if (NumElems == 4 && NumZero > 0) { + for (unsigned i = 0; i < 4; ++i) { + bool isZero = !(NonZeros & (1 << i)); + if (isZero) + V[i] = getZeroVector(VT, Subtarget->hasXMMInt(), DAG, dl); + else + V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); + } + + for (unsigned i = 0; i < 2; ++i) { + switch ((NonZeros & (0x3 << i*2)) >> (i*2)) { + default: break; + case 0: + V[i] = V[i*2]; // Must be a zero vector. + break; + case 1: + V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]); + break; + case 2: + V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]); + break; + case 3: + V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]); + break; + } + } + + SmallVector<int, 8> MaskVec; + bool Reverse = (NonZeros & 0x3) == 2; + for (unsigned i = 0; i < 2; ++i) + MaskVec.push_back(Reverse ? 1-i : i); + Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2; + for (unsigned i = 0; i < 2; ++i) + MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems); + return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); + } + + if (Values.size() > 1 && VT.getSizeInBits() == 128) { + // Check for a build vector of consecutive loads. + for (unsigned i = 0; i < NumElems; ++i) + V[i] = Op.getOperand(i); + + // Check for elements which are consecutive loads. 
+ SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG); + if (LD.getNode()) + return LD; + + // For SSE 4.1, use insertps to put the high elements into the low element. + if (getSubtarget()->hasSSE41() || getSubtarget()->hasAVX()) { + SDValue Result; + if (Op.getOperand(0).getOpcode() != ISD::UNDEF) + Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); + else + Result = DAG.getUNDEF(VT); + + for (unsigned i = 1; i < NumElems; ++i) { + if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue; + Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result, + Op.getOperand(i), DAG.getIntPtrConstant(i)); + } + return Result; + } + + // Otherwise, expand into a number of unpckl*, start by extending each of + // our (non-undef) elements to the full vector width with the element in the + // bottom slot of the vector (which generates no code for SSE). + for (unsigned i = 0; i < NumElems; ++i) { + if (Op.getOperand(i).getOpcode() != ISD::UNDEF) + V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); + else + V[i] = DAG.getUNDEF(VT); + } + + // Next, we iteratively mix elements, e.g. for v4f32: + // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> + // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> + // Step 2: unpcklps X, Y ==> <3, 2, 1, 0> + unsigned EltStride = NumElems >> 1; + while (EltStride != 0) { + for (unsigned i = 0; i < EltStride; ++i) { + // If V[i+EltStride] is undef and this is the first round of mixing, + // then it is safe to just drop this shuffle: V[i] is already in the + // right place, the one element (since it's the first round) being + // inserted as undef can be dropped. This isn't safe for successive + // rounds because they will permute elements within both vectors. + if (V[i+EltStride].getOpcode() == ISD::UNDEF && + EltStride == NumElems/2) + continue; + + V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]); + } + EltStride >>= 1; + } + return V[0]; + } + return SDValue(); +} + +// LowerMMXCONCAT_VECTORS - We support concatenate two MMX registers and place +// them in a MMX register. This is better than doing a stack convert. +static SDValue LowerMMXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { + DebugLoc dl = Op.getDebugLoc(); + EVT ResVT = Op.getValueType(); + + assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 || + ResVT == MVT::v8i16 || ResVT == MVT::v16i8); + int Mask[2]; + SDValue InVec = DAG.getNode(ISD::BITCAST,dl, MVT::v1i64, Op.getOperand(0)); + SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); + InVec = Op.getOperand(1); + if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) { + unsigned NumElts = ResVT.getVectorNumElements(); + VecOp = DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp); + VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp, + InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1)); + } else { + InVec = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, InVec); + SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); + Mask[0] = 0; Mask[1] = 2; + VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask); + } + return DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp); +} + +// LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction +// to create 256-bit vectors from two other 128-bit ones. 
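+// For illustration: concat_vectors (v4f32 A), (v4f32 B) is built with two
+// Insert128BitVector calls, placing A in the low 128-bit lane of an undef
+// v8f32 and B in the high lane, which typically selects to a vinsertf128 for
+// the high half.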
+static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { + DebugLoc dl = Op.getDebugLoc(); + EVT ResVT = Op.getValueType(); + + assert(ResVT.getSizeInBits() == 256 && "Value type must be 256-bit wide"); + + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + unsigned NumElems = ResVT.getVectorNumElements(); + + SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, ResVT), V1, + DAG.getConstant(0, MVT::i32), DAG, dl); + return Insert128BitVector(V, V2, DAG.getConstant(NumElems/2, MVT::i32), + DAG, dl); +} + +SDValue +X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { + EVT ResVT = Op.getValueType(); + + assert(Op.getNumOperands() == 2); + assert((ResVT.getSizeInBits() == 128 || ResVT.getSizeInBits() == 256) && + "Unsupported CONCAT_VECTORS for value type"); + + // We support concatenate two MMX registers and place them in a MMX register. + // This is better than doing a stack convert. + if (ResVT.is128BitVector()) + return LowerMMXCONCAT_VECTORS(Op, DAG); + + // 256-bit AVX can use the vinsertf128 instruction to create 256-bit vectors + // from two other 128-bit ones. + return LowerAVXCONCAT_VECTORS(Op, DAG); +} + +// v8i16 shuffles - Prefer shuffles in the following order: +// 1. [all] pshuflw, pshufhw, optional move +// 2. [ssse3] 1 x pshufb +// 3. [ssse3] 2 x pshufb + 1 x por +// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw) +SDValue +X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, + SelectionDAG &DAG) const { + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + SDValue V1 = SVOp->getOperand(0); + SDValue V2 = SVOp->getOperand(1); + DebugLoc dl = SVOp->getDebugLoc(); + SmallVector<int, 8> MaskVals; + + // Determine if more than 1 of the words in each of the low and high quadwords + // of the result come from the same quadword of one of the two inputs. Undef + // mask values count as coming from any quadword, for better codegen. + SmallVector<unsigned, 4> LoQuad(4); + SmallVector<unsigned, 4> HiQuad(4); + BitVector InputQuads(4); + for (unsigned i = 0; i < 8; ++i) { + SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad; + int EltIdx = SVOp->getMaskElt(i); + MaskVals.push_back(EltIdx); + if (EltIdx < 0) { + ++Quad[0]; + ++Quad[1]; + ++Quad[2]; + ++Quad[3]; + continue; + } + ++Quad[EltIdx / 4]; + InputQuads.set(EltIdx / 4); + } + + int BestLoQuad = -1; + unsigned MaxQuad = 1; + for (unsigned i = 0; i < 4; ++i) { + if (LoQuad[i] > MaxQuad) { + BestLoQuad = i; + MaxQuad = LoQuad[i]; + } + } + + int BestHiQuad = -1; + MaxQuad = 1; + for (unsigned i = 0; i < 4; ++i) { + if (HiQuad[i] > MaxQuad) { + BestHiQuad = i; + MaxQuad = HiQuad[i]; + } + } + + // For SSSE3, If all 8 words of the result come from only 1 quadword of each + // of the two input vectors, shuffle them into one input vector so only a + // single pshufb instruction is necessary. If There are more than 2 input + // quads, disable the next transformation since it does not help SSSE3. + bool V1Used = InputQuads[0] || InputQuads[1]; + bool V2Used = InputQuads[2] || InputQuads[3]; + if (Subtarget->hasSSSE3() || Subtarget->hasAVX()) { + if (InputQuads.count() == 2 && V1Used && V2Used) { + BestLoQuad = InputQuads.find_first(); + BestHiQuad = InputQuads.find_next(BestLoQuad); + } + if (InputQuads.count() > 2) { + BestLoQuad = -1; + BestHiQuad = -1; + } + } + + // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update + // the shuffle mask. 
If a quad is scored as -1, that means that it contains + // words from all 4 input quadwords. + SDValue NewV; + if (BestLoQuad >= 0 || BestHiQuad >= 0) { + SmallVector<int, 8> MaskV; + MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad); + MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad); + NewV = DAG.getVectorShuffle(MVT::v2i64, dl, + DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1), + DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]); + NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV); + + // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the + // source words for the shuffle, to aid later transformations. + bool AllWordsInNewV = true; + bool InOrder[2] = { true, true }; + for (unsigned i = 0; i != 8; ++i) { + int idx = MaskVals[i]; + if (idx != (int)i) + InOrder[i/4] = false; + if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad) + continue; + AllWordsInNewV = false; + break; + } + + bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV; + if (AllWordsInNewV) { + for (int i = 0; i != 8; ++i) { + int idx = MaskVals[i]; + if (idx < 0) + continue; + idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4; + if ((idx != i) && idx < 4) + pshufhw = false; + if ((idx != i) && idx > 3) + pshuflw = false; + } + V1 = NewV; + V2Used = false; + BestLoQuad = 0; + BestHiQuad = 1; + } + + // If we've eliminated the use of V2, and the new mask is a pshuflw or + // pshufhw, that's as cheap as it gets. Return the new shuffle. + if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) { + unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW; + unsigned TargetMask = 0; + NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, + DAG.getUNDEF(MVT::v8i16), &MaskVals[0]); + TargetMask = pshufhw ? X86::getShufflePSHUFHWImmediate(NewV.getNode()): + X86::getShufflePSHUFLWImmediate(NewV.getNode()); + V1 = NewV.getOperand(0); + return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG); + } + } + + // If we have SSSE3, and all words of the result are from 1 input vector, + // case 2 is generated, otherwise case 3 is generated. If no SSSE3 + // is present, fall back to case 4. + if (Subtarget->hasSSSE3() || Subtarget->hasAVX()) { + SmallVector<SDValue,16> pshufbMask; + + // If we have elements from both input vectors, set the high bit of the + // shuffle mask element to zero out elements that come from V2 in the V1 + // mask, and elements that come from V1 in the V2 mask, so that the two + // results can be OR'd together. + bool TwoInputs = V1Used && V2Used; + for (unsigned i = 0; i != 8; ++i) { + int EltIdx = MaskVals[i] * 2; + if (TwoInputs && (EltIdx >= 16)) { + pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); + continue; + } + pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8)); + } + V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1); + V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, + DAG.getNode(ISD::BUILD_VECTOR, dl, + MVT::v16i8, &pshufbMask[0], 16)); + if (!TwoInputs) + return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); + + // Calculate the shuffle mask for the second input, shuffle it, and + // OR it with the first shuffled input. 
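+  // For illustration: result word i maps to source bytes 2*MaskVals[i] and
+  // 2*MaskVals[i]+1; lanes that belong to the other input get mask byte 0x80,
+  // which pshufb treats as "write zero", so the two partial results can
+  // simply be OR'd together.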
+ pshufbMask.clear(); + for (unsigned i = 0; i != 8; ++i) { + int EltIdx = MaskVals[i] * 2; + if (EltIdx < 16) { + pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); + continue; + } + pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); + pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8)); + } + V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2); + V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, + DAG.getNode(ISD::BUILD_VECTOR, dl, + MVT::v16i8, &pshufbMask[0], 16)); + V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); + return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); + } + + // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, + // and update MaskVals with new element order. + BitVector InOrder(8); + if (BestLoQuad >= 0) { + SmallVector<int, 8> MaskV; + for (int i = 0; i != 4; ++i) { + int idx = MaskVals[i]; + if (idx < 0) { + MaskV.push_back(-1); + InOrder.set(i); + } else if ((idx / 4) == BestLoQuad) { + MaskV.push_back(idx & 3); + InOrder.set(i); + } else { + MaskV.push_back(-1); + } + } + for (unsigned i = 4; i != 8; ++i) + MaskV.push_back(i); + NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), + &MaskV[0]); + + if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && + (Subtarget->hasSSSE3() || Subtarget->hasAVX())) + NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16, + NewV.getOperand(0), + X86::getShufflePSHUFLWImmediate(NewV.getNode()), + DAG); + } + + // If BestHi >= 0, generate a pshufhw to put the high elements in order, + // and update MaskVals with the new element order. + if (BestHiQuad >= 0) { + SmallVector<int, 8> MaskV; + for (unsigned i = 0; i != 4; ++i) + MaskV.push_back(i); + for (unsigned i = 4; i != 8; ++i) { + int idx = MaskVals[i]; + if (idx < 0) { + MaskV.push_back(-1); + InOrder.set(i); + } else if ((idx / 4) == BestHiQuad) { + MaskV.push_back((idx & 3) + 4); + InOrder.set(i); + } else { + MaskV.push_back(-1); + } + } + NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), + &MaskV[0]); + + if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && + (Subtarget->hasSSSE3() || Subtarget->hasAVX())) + NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16, + NewV.getOperand(0), + X86::getShufflePSHUFHWImmediate(NewV.getNode()), + DAG); + } + + // In case BestHi & BestLo were both -1, which means each quadword has a word + // from each of the four input quadwords, calculate the InOrder bitvector now + // before falling through to the insert/extract cleanup. + if (BestLoQuad == -1 && BestHiQuad == -1) { + NewV = V1; + for (int i = 0; i != 8; ++i) + if (MaskVals[i] < 0 || MaskVals[i] == i) + InOrder.set(i); + } + + // The other elements are put in the right place using pextrw and pinsrw. + for (unsigned i = 0; i != 8; ++i) { + if (InOrder[i]) + continue; + int EltIdx = MaskVals[i]; + if (EltIdx < 0) + continue; + SDValue ExtOp = (EltIdx < 8) + ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, + DAG.getIntPtrConstant(EltIdx)) + : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, + DAG.getIntPtrConstant(EltIdx - 8)); + NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, + DAG.getIntPtrConstant(i)); + } + return NewV; +} + +// v16i8 shuffles - Prefer shuffles in the following order: +// 1. [ssse3] 1 x pshufb +// 2. [ssse3] 2 x pshufb + 1 x por +// 3. 
[all] v8i16 shuffle + N x pextrw + rotate + pinsrw +static +SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, + SelectionDAG &DAG, + const X86TargetLowering &TLI) { + SDValue V1 = SVOp->getOperand(0); + SDValue V2 = SVOp->getOperand(1); + DebugLoc dl = SVOp->getDebugLoc(); + SmallVector<int, 16> MaskVals; + SVOp->getMask(MaskVals); + + // If we have SSSE3, case 1 is generated when all result bytes come from + // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is + // present, fall back to case 3. + // FIXME: kill V2Only once shuffles are canonizalized by getNode. + bool V1Only = true; + bool V2Only = true; + for (unsigned i = 0; i < 16; ++i) { + int EltIdx = MaskVals[i]; + if (EltIdx < 0) + continue; + if (EltIdx < 16) + V2Only = false; + else + V1Only = false; + } + + // If SSSE3, use 1 pshufb instruction per vector with elements in the result. + if (TLI.getSubtarget()->hasSSSE3() || TLI.getSubtarget()->hasAVX()) { + SmallVector<SDValue,16> pshufbMask; + + // If all result elements are from one input vector, then only translate + // undef mask values to 0x80 (zero out result) in the pshufb mask. + // + // Otherwise, we have elements from both input vectors, and must zero out + // elements that come from V2 in the first mask, and V1 in the second mask + // so that we can OR them together. + bool TwoInputs = !(V1Only || V2Only); + for (unsigned i = 0; i != 16; ++i) { + int EltIdx = MaskVals[i]; + if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) { + pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); + continue; + } + pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); + } + // If all the elements are from V2, assign it to V1 and return after + // building the first pshufb. + if (V2Only) + V1 = V2; + V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, + DAG.getNode(ISD::BUILD_VECTOR, dl, + MVT::v16i8, &pshufbMask[0], 16)); + if (!TwoInputs) + return V1; + + // Calculate the shuffle mask for the second input, shuffle it, and + // OR it with the first shuffled input. + pshufbMask.clear(); + for (unsigned i = 0; i != 16; ++i) { + int EltIdx = MaskVals[i]; + if (EltIdx < 16) { + pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); + continue; + } + pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); + } + V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, + DAG.getNode(ISD::BUILD_VECTOR, dl, + MVT::v16i8, &pshufbMask[0], 16)); + return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); + } + + // No SSSE3 - Calculate in place words and then fix all out of place words + // With 0-16 extracts & inserts. Worst case is 16 bytes out of order from + // the 16 different words that comprise the two doublequadword input vectors. + V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); + V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2); + SDValue NewV = V2Only ? V2 : V1; + for (int i = 0; i != 8; ++i) { + int Elt0 = MaskVals[i*2]; + int Elt1 = MaskVals[i*2+1]; + + // This word of the result is all undef, skip it. + if (Elt0 < 0 && Elt1 < 0) + continue; + + // This word of the result is already in the correct place, skip it. + if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1)) + continue; + if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17)) + continue; + + SDValue Elt0Src = Elt0 < 16 ? V1 : V2; + SDValue Elt1Src = Elt1 < 16 ? V1 : V2; + SDValue InsElt; + + // If Elt0 and Elt1 are defined, are consecutive, and can be load + // using a single extract together, load it and store it. 
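+    // For illustration: Elt0 == 4 and Elt1 == 5 form one aligned word of the
+    // source, so a single extract of word 2 (typically pextrw) followed by an
+    // insert into word i of the result (typically pinsrw) is enough, with no
+    // shifting or masking.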
+ if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) { + InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, + DAG.getIntPtrConstant(Elt1 / 2)); + NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, + DAG.getIntPtrConstant(i)); + continue; + } + + // If Elt1 is defined, extract it from the appropriate source. If the + // source byte is not also odd, shift the extracted word left 8 bits + // otherwise clear the bottom 8 bits if we need to do an or. + if (Elt1 >= 0) { + InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, + DAG.getIntPtrConstant(Elt1 / 2)); + if ((Elt1 & 1) == 0) + InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt, + DAG.getConstant(8, + TLI.getShiftAmountTy(InsElt.getValueType()))); + else if (Elt0 >= 0) + InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt, + DAG.getConstant(0xFF00, MVT::i16)); + } + // If Elt0 is defined, extract it from the appropriate source. If the + // source byte is not also even, shift the extracted word right 8 bits. If + // Elt1 was also defined, OR the extracted values together before + // inserting them in the result. + if (Elt0 >= 0) { + SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, + Elt0Src, DAG.getIntPtrConstant(Elt0 / 2)); + if ((Elt0 & 1) != 0) + InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0, + DAG.getConstant(8, + TLI.getShiftAmountTy(InsElt0.getValueType()))); + else if (Elt1 >= 0) + InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0, + DAG.getConstant(0x00FF, MVT::i16)); + InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0) + : InsElt0; + } + NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, + DAG.getIntPtrConstant(i)); + } + return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV); +} + +/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide +/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be +/// done when every pair / quad of shuffle mask elements point to elements in +/// the right sequence. e.g. +/// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15> +static +SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, + SelectionDAG &DAG, DebugLoc dl) { + EVT VT = SVOp->getValueType(0); + SDValue V1 = SVOp->getOperand(0); + SDValue V2 = SVOp->getOperand(1); + unsigned NumElems = VT.getVectorNumElements(); + unsigned NewWidth = (NumElems == 4) ? 2 : 4; + EVT NewVT; + switch (VT.getSimpleVT().SimpleTy) { + default: assert(false && "Unexpected!"); + case MVT::v4f32: NewVT = MVT::v2f64; break; + case MVT::v4i32: NewVT = MVT::v2i64; break; + case MVT::v8i16: NewVT = MVT::v4i32; break; + case MVT::v16i8: NewVT = MVT::v4i32; break; + } + + int Scale = NumElems / NewWidth; + SmallVector<int, 8> MaskVec; + for (unsigned i = 0; i < NumElems; i += Scale) { + int StartIdx = -1; + for (int j = 0; j < Scale; ++j) { + int EltIdx = SVOp->getMaskElt(i+j); + if (EltIdx < 0) + continue; + if (StartIdx == -1) + StartIdx = EltIdx - (EltIdx % Scale); + if (EltIdx != StartIdx + j) + return SDValue(); + } + if (StartIdx == -1) + MaskVec.push_back(-1); + else + MaskVec.push_back(StartIdx / Scale); + } + + V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1); + V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2); + return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]); +} + +/// getVZextMovL - Return a zero-extending vector move low node. 
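+/// Informally, the result keeps only the low element of its operand and
+/// zeroes the remaining lanes (e.g. a v2i64 result is <x[0], 0>), which
+/// usually selects to a movq-style move.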
+/// +static SDValue getVZextMovL(EVT VT, EVT OpVT, + SDValue SrcOp, SelectionDAG &DAG, + const X86Subtarget *Subtarget, DebugLoc dl) { + if (VT == MVT::v2f64 || VT == MVT::v4f32) { + LoadSDNode *LD = NULL; + if (!isScalarLoadToVector(SrcOp.getNode(), &LD)) + LD = dyn_cast<LoadSDNode>(SrcOp); + if (!LD) { + // movssrr and movsdrr do not clear top bits. Try to use movd, movq + // instead. + MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32; + if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) && + SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR && + SrcOp.getOperand(0).getOpcode() == ISD::BITCAST && + SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) { + // PR2108 + OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32; + return DAG.getNode(ISD::BITCAST, dl, VT, + DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, + DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, + OpVT, + SrcOp.getOperand(0) + .getOperand(0)))); + } + } + } + + return DAG.getNode(ISD::BITCAST, dl, VT, + DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, + DAG.getNode(ISD::BITCAST, dl, + OpVT, SrcOp))); +} + +/// areShuffleHalvesWithinDisjointLanes - Check whether each half of a vector +/// shuffle node referes to only one lane in the sources. +static bool areShuffleHalvesWithinDisjointLanes(ShuffleVectorSDNode *SVOp) { + EVT VT = SVOp->getValueType(0); + int NumElems = VT.getVectorNumElements(); + int HalfSize = NumElems/2; + SmallVector<int, 16> M; + SVOp->getMask(M); + bool MatchA = false, MatchB = false; + + for (int l = 0; l < NumElems*2; l += HalfSize) { + if (isUndefOrInRange(M, 0, HalfSize, l, l+HalfSize)) { + MatchA = true; + break; + } + } + + for (int l = 0; l < NumElems*2; l += HalfSize) { + if (isUndefOrInRange(M, HalfSize, HalfSize, l, l+HalfSize)) { + MatchB = true; + break; + } + } + + return MatchA && MatchB; +} + +/// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vectors shuffles +/// which could not be matched by any known target speficic shuffle +static SDValue +LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { + if (areShuffleHalvesWithinDisjointLanes(SVOp)) { + // If each half of a vector shuffle node referes to only one lane in the + // source vectors, extract each used 128-bit lane and shuffle them using + // 128-bit shuffles. Then, concatenate the results. Otherwise leave + // the work to the legalizer. + DebugLoc dl = SVOp->getDebugLoc(); + EVT VT = SVOp->getValueType(0); + int NumElems = VT.getVectorNumElements(); + int HalfSize = NumElems/2; + + // Extract the reference for each half + int FstVecExtractIdx = 0, SndVecExtractIdx = 0; + int FstVecOpNum = 0, SndVecOpNum = 0; + for (int i = 0; i < HalfSize; ++i) { + int Elt = SVOp->getMaskElt(i); + if (SVOp->getMaskElt(i) < 0) + continue; + FstVecOpNum = Elt/NumElems; + FstVecExtractIdx = Elt % NumElems < HalfSize ? 0 : HalfSize; + break; + } + for (int i = HalfSize; i < NumElems; ++i) { + int Elt = SVOp->getMaskElt(i); + if (SVOp->getMaskElt(i) < 0) + continue; + SndVecOpNum = Elt/NumElems; + SndVecExtractIdx = Elt % NumElems < HalfSize ? 0 : HalfSize; + break; + } + + // Extract the subvectors + SDValue V1 = Extract128BitVector(SVOp->getOperand(FstVecOpNum), + DAG.getConstant(FstVecExtractIdx, MVT::i32), DAG, dl); + SDValue V2 = Extract128BitVector(SVOp->getOperand(SndVecOpNum), + DAG.getConstant(SndVecExtractIdx, MVT::i32), DAG, dl); + + // Generate 128-bit shuffles + SmallVector<int, 16> MaskV1, MaskV2; + for (int i = 0; i < HalfSize; ++i) { + int Elt = SVOp->getMaskElt(i); + MaskV1.push_back(Elt < 0 ? 
Elt : Elt % HalfSize); + } + for (int i = HalfSize; i < NumElems; ++i) { + int Elt = SVOp->getMaskElt(i); + MaskV2.push_back(Elt < 0 ? Elt : Elt % HalfSize); + } + + EVT NVT = V1.getValueType(); + V1 = DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &MaskV1[0]); + V2 = DAG.getVectorShuffle(NVT, dl, V2, DAG.getUNDEF(NVT), &MaskV2[0]); + + // Concatenate the result back + SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), V1, + DAG.getConstant(0, MVT::i32), DAG, dl); + return Insert128BitVector(V, V2, DAG.getConstant(NumElems/2, MVT::i32), + DAG, dl); + } + + return SDValue(); +} + +/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with +/// 4 elements, and match them with several different shuffle types. +static SDValue +LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { + SDValue V1 = SVOp->getOperand(0); + SDValue V2 = SVOp->getOperand(1); + DebugLoc dl = SVOp->getDebugLoc(); + EVT VT = SVOp->getValueType(0); + + assert(VT.getSizeInBits() == 128 && "Unsupported vector size"); + + SmallVector<std::pair<int, int>, 8> Locs; + Locs.resize(4); + SmallVector<int, 8> Mask1(4U, -1); + SmallVector<int, 8> PermMask; + SVOp->getMask(PermMask); + + unsigned NumHi = 0; + unsigned NumLo = 0; + for (unsigned i = 0; i != 4; ++i) { + int Idx = PermMask[i]; + if (Idx < 0) { + Locs[i] = std::make_pair(-1, -1); + } else { + assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!"); + if (Idx < 4) { + Locs[i] = std::make_pair(0, NumLo); + Mask1[NumLo] = Idx; + NumLo++; + } else { + Locs[i] = std::make_pair(1, NumHi); + if (2+NumHi < 4) + Mask1[2+NumHi] = Idx; + NumHi++; + } + } + } + + if (NumLo <= 2 && NumHi <= 2) { + // If no more than two elements come from either vector. This can be + // implemented with two shuffles. First shuffle gather the elements. + // The second shuffle, which takes the first shuffle as both of its + // vector operands, put the elements into the right order. + V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); + + SmallVector<int, 8> Mask2(4U, -1); + + for (unsigned i = 0; i != 4; ++i) { + if (Locs[i].first == -1) + continue; + else { + unsigned Idx = (i < 2) ? 0 : 4; + Idx += Locs[i].first * 2 + Locs[i].second; + Mask2[i] = Idx; + } + } + + return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]); + } else if (NumLo == 3 || NumHi == 3) { + // Otherwise, we must have three elements from one vector, call it X, and + // one element from the other, call it Y. First, use a shufps to build an + // intermediate vector with the one element from Y and the element from X + // that will be in the same half in the final destination (the indexes don't + // matter). Then, use a shufps to build the final vector, taking the half + // containing the element from Y from the intermediate, and the other half + // from X. + if (NumHi == 3) { + // Normalize it so the 3 elements come from V1. + CommuteVectorShuffleMask(PermMask, VT); + std::swap(V1, V2); + } + + // Find the element from V2. + unsigned HiIndex; + for (HiIndex = 0; HiIndex < 3; ++HiIndex) { + int Val = PermMask[HiIndex]; + if (Val < 0) + continue; + if (Val >= 4) + break; + } + + Mask1[0] = PermMask[HiIndex]; + Mask1[1] = -1; + Mask1[2] = PermMask[HiIndex^1]; + Mask1[3] = -1; + V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); + + if (HiIndex >= 2) { + Mask1[0] = PermMask[0]; + Mask1[1] = PermMask[1]; + Mask1[2] = HiIndex & 1 ? 6 : 4; + Mask1[3] = HiIndex & 1 ? 4 : 6; + return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); + } else { + Mask1[0] = HiIndex & 1 ? 
2 : 0; + Mask1[1] = HiIndex & 1 ? 0 : 2; + Mask1[2] = PermMask[2]; + Mask1[3] = PermMask[3]; + if (Mask1[2] >= 0) + Mask1[2] += 4; + if (Mask1[3] >= 0) + Mask1[3] += 4; + return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]); + } + } + + // Break it into (shuffle shuffle_hi, shuffle_lo). + Locs.clear(); + Locs.resize(4); + SmallVector<int,8> LoMask(4U, -1); + SmallVector<int,8> HiMask(4U, -1); + + SmallVector<int,8> *MaskPtr = &LoMask; + unsigned MaskIdx = 0; + unsigned LoIdx = 0; + unsigned HiIdx = 2; + for (unsigned i = 0; i != 4; ++i) { + if (i == 2) { + MaskPtr = &HiMask; + MaskIdx = 1; + LoIdx = 0; + HiIdx = 2; + } + int Idx = PermMask[i]; + if (Idx < 0) { + Locs[i] = std::make_pair(-1, -1); + } else if (Idx < 4) { + Locs[i] = std::make_pair(MaskIdx, LoIdx); + (*MaskPtr)[LoIdx] = Idx; + LoIdx++; + } else { + Locs[i] = std::make_pair(MaskIdx, HiIdx); + (*MaskPtr)[HiIdx] = Idx; + HiIdx++; + } + } + + SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]); + SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]); + SmallVector<int, 8> MaskOps; + for (unsigned i = 0; i != 4; ++i) { + if (Locs[i].first == -1) { + MaskOps.push_back(-1); + } else { + unsigned Idx = Locs[i].first * 4 + Locs[i].second; + MaskOps.push_back(Idx); + } + } + return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]); +} + +static bool MayFoldVectorLoad(SDValue V) { + if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) + V = V.getOperand(0); + if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR) + V = V.getOperand(0); + if (MayFoldLoad(V)) + return true; + return false; +} + +// FIXME: the version above should always be used. Since there's +// a bug where several vector shuffles can't be folded because the +// DAG is not updated during lowering and a node claims to have two +// uses while it only has one, use this version, and let isel match +// another instruction if the load really happens to have more than +// one use. Remove this version after this bug get fixed. +// rdar://8434668, PR8156 +static bool RelaxedMayFoldVectorLoad(SDValue V) { + if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) + V = V.getOperand(0); + if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR) + V = V.getOperand(0); + if (ISD::isNormalLoad(V.getNode())) + return true; + return false; +} + +/// CanFoldShuffleIntoVExtract - Check if the current shuffle is used by +/// a vector extract, and if both can be later optimized into a single load. +/// This is done in visitEXTRACT_VECTOR_ELT and the conditions are checked +/// here because otherwise a target specific shuffle node is going to be +/// emitted for this shuffle, and the optimization not done. +/// FIXME: This is probably not the best approach, but fix the problem +/// until the right path is decided. +static +bool CanXFormVExtractWithShuffleIntoLoad(SDValue V, SelectionDAG &DAG, + const TargetLowering &TLI) { + EVT VT = V.getValueType(); + ShuffleVectorSDNode *SVOp = dyn_cast<ShuffleVectorSDNode>(V); + + // Be sure that the vector shuffle is present in a pattern like this: + // (vextract (v4f32 shuffle (load $addr), <1,u,u,u>), c) -> (f32 load $addr) + if (!V.hasOneUse()) + return false; + + SDNode *N = *V.getNode()->use_begin(); + if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return false; + + SDValue EltNo = N->getOperand(1); + if (!isa<ConstantSDNode>(EltNo)) + return false; + + // If the bit convert changed the number of elements, it is unsafe + // to examine the mask. 
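+  // For example, a bitcast from v8i16 to v4i32 halves the element count, so
+  // the extract index would no longer correspond 1:1 to a shuffle mask
+  // element; the code below simply gives up in that case.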
+ bool HasShuffleIntoBitcast = false; + if (V.getOpcode() == ISD::BITCAST) { + EVT SrcVT = V.getOperand(0).getValueType(); + if (SrcVT.getVectorNumElements() != VT.getVectorNumElements()) + return false; + V = V.getOperand(0); + HasShuffleIntoBitcast = true; + } + + // Select the input vector, guarding against out of range extract vector. + unsigned NumElems = VT.getVectorNumElements(); + unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue(); + int Idx = (Elt > NumElems) ? -1 : SVOp->getMaskElt(Elt); + V = (Idx < (int)NumElems) ? V.getOperand(0) : V.getOperand(1); + + // Skip one more bit_convert if necessary + if (V.getOpcode() == ISD::BITCAST) + V = V.getOperand(0); + + if (ISD::isNormalLoad(V.getNode())) { + // Is the original load suitable? + LoadSDNode *LN0 = cast<LoadSDNode>(V); + + // FIXME: avoid the multi-use bug that is preventing lots of + // of foldings to be detected, this is still wrong of course, but + // give the temporary desired behavior, and if it happens that + // the load has real more uses, during isel it will not fold, and + // will generate poor code. + if (!LN0 || LN0->isVolatile()) // || !LN0->hasOneUse() + return false; + + if (!HasShuffleIntoBitcast) + return true; + + // If there's a bitcast before the shuffle, check if the load type and + // alignment is valid. + unsigned Align = LN0->getAlignment(); + unsigned NewAlign = + TLI.getTargetData()->getABITypeAlignment( + VT.getTypeForEVT(*DAG.getContext())); + + if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT)) + return false; + } + + return true; +} + +static +SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) { + EVT VT = Op.getValueType(); + + // Canonizalize to v2f64. + V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1); + return DAG.getNode(ISD::BITCAST, dl, VT, + getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64, + V1, DAG)); +} + +static +SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, + bool HasXMMInt) { + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + EVT VT = Op.getValueType(); + + assert(VT != MVT::v2i64 && "unsupported shuffle type"); + + if (HasXMMInt && VT == MVT::v2f64) + return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG); + + // v4f32 or v4i32: canonizalized to v4f32 (which is legal for SSE1) + return DAG.getNode(ISD::BITCAST, dl, VT, + getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32, + DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1), + DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG)); +} + +static +SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) { + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + EVT VT = Op.getValueType(); + + assert((VT == MVT::v4i32 || VT == MVT::v4f32) && + "unsupported shuffle type"); + + if (V2.getOpcode() == ISD::UNDEF) + V2 = V1; + + // v4i32 or v4f32 + return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG); +} + +static inline unsigned getSHUFPOpcode(EVT VT) { + switch(VT.getSimpleVT().SimpleTy) { + case MVT::v8i32: // Use fp unit for int unpack. + case MVT::v8f32: + case MVT::v4i32: // Use fp unit for int unpack. + case MVT::v4f32: return X86ISD::SHUFPS; + case MVT::v4i64: // Use fp unit for int unpack. + case MVT::v4f64: + case MVT::v2i64: // Use fp unit for int unpack. 
+ case MVT::v2f64: return X86ISD::SHUFPD; + default: + llvm_unreachable("Unknown type for shufp*"); + } + return 0; +} + +static +SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasXMMInt) { + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + EVT VT = Op.getValueType(); + unsigned NumElems = VT.getVectorNumElements(); + + // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second + // operand of these instructions is only memory, so check if there's a + // potencial load folding here, otherwise use SHUFPS or MOVSD to match the + // same masks. + bool CanFoldLoad = false; + + // Trivial case, when V2 comes from a load. + if (MayFoldVectorLoad(V2)) + CanFoldLoad = true; + + // When V1 is a load, it can be folded later into a store in isel, example: + // (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1) + // turns into: + // (MOVLPSmr addr:$src1, VR128:$src2) + // So, recognize this potential and also use MOVLPS or MOVLPD + if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op)) + CanFoldLoad = true; + + // Both of them can't be memory operations though. + if (MayFoldVectorLoad(V1) && MayFoldVectorLoad(V2)) + CanFoldLoad = false; + + if (CanFoldLoad) { + if (HasXMMInt && NumElems == 2) + return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG); + + if (NumElems == 4) + return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG); + } + + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + // movl and movlp will both match v2i64, but v2i64 is never matched by + // movl earlier because we make it strict to avoid messing with the movlp load + // folding logic (see the code above getMOVLP call). Match it here then, + // this is horrible, but will stay like this until we move all shuffle + // matching to x86 specific nodes. Note that for the 1st condition all + // types are matched with movsd. + if (HasXMMInt) { + // FIXME: isMOVLMask should be checked and matched before getMOVLP, + // as to remove this logic from here, as much as possible + if (NumElems == 2 || !X86::isMOVLMask(SVOp)) + return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); + return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG); + } + + assert(VT != MVT::v4i32 && "unsupported shuffle type"); + + // Invert the operand order and use SHUFPS to match it. + return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V2, V1, + X86::getShuffleSHUFImmediate(SVOp), DAG); +} + +static inline unsigned getUNPCKLOpcode(EVT VT) { + switch(VT.getSimpleVT().SimpleTy) { + case MVT::v4i32: return X86ISD::PUNPCKLDQ; + case MVT::v2i64: return X86ISD::PUNPCKLQDQ; + case MVT::v4f32: return X86ISD::UNPCKLPS; + case MVT::v2f64: return X86ISD::UNPCKLPD; + case MVT::v8i32: // Use fp unit for int unpack. + case MVT::v8f32: return X86ISD::VUNPCKLPSY; + case MVT::v4i64: // Use fp unit for int unpack. + case MVT::v4f64: return X86ISD::VUNPCKLPDY; + case MVT::v16i8: return X86ISD::PUNPCKLBW; + case MVT::v8i16: return X86ISD::PUNPCKLWD; + default: + llvm_unreachable("Unknown type for unpckl"); + } + return 0; +} + +static inline unsigned getUNPCKHOpcode(EVT VT) { + switch(VT.getSimpleVT().SimpleTy) { + case MVT::v4i32: return X86ISD::PUNPCKHDQ; + case MVT::v2i64: return X86ISD::PUNPCKHQDQ; + case MVT::v4f32: return X86ISD::UNPCKHPS; + case MVT::v2f64: return X86ISD::UNPCKHPD; + case MVT::v8i32: // Use fp unit for int unpack. + case MVT::v8f32: return X86ISD::VUNPCKHPSY; + case MVT::v4i64: // Use fp unit for int unpack. 
+ case MVT::v4f64: return X86ISD::VUNPCKHPDY; + case MVT::v16i8: return X86ISD::PUNPCKHBW; + case MVT::v8i16: return X86ISD::PUNPCKHWD; + default: + llvm_unreachable("Unknown type for unpckh"); + } + return 0; +} + +static inline unsigned getVPERMILOpcode(EVT VT) { + switch(VT.getSimpleVT().SimpleTy) { + case MVT::v4i32: + case MVT::v4f32: return X86ISD::VPERMILPS; + case MVT::v2i64: + case MVT::v2f64: return X86ISD::VPERMILPD; + case MVT::v8i32: + case MVT::v8f32: return X86ISD::VPERMILPSY; + case MVT::v4i64: + case MVT::v4f64: return X86ISD::VPERMILPDY; + default: + llvm_unreachable("Unknown type for vpermil"); + } + return 0; +} + +/// isVectorBroadcast - Check if the node chain is suitable to be xformed to +/// a vbroadcast node. The nodes are suitable whenever we can fold a load coming +/// from a 32 or 64 bit scalar. Update Op to the desired load to be folded. +static bool isVectorBroadcast(SDValue &Op) { + EVT VT = Op.getValueType(); + bool Is256 = VT.getSizeInBits() == 256; + + assert((VT.getSizeInBits() == 128 || Is256) && + "Unsupported type for vbroadcast node"); + + SDValue V = Op; + if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) + V = V.getOperand(0); + + if (Is256 && !(V.hasOneUse() && + V.getOpcode() == ISD::INSERT_SUBVECTOR && + V.getOperand(0).getOpcode() == ISD::UNDEF)) + return false; + + if (Is256) + V = V.getOperand(1); + + if (!V.hasOneUse()) + return false; + + // Check the source scalar_to_vector type. 256-bit broadcasts are + // supported for 32/64-bit sizes, while 128-bit ones are only supported + // for 32-bit scalars. + if (V.getOpcode() != ISD::SCALAR_TO_VECTOR) + return false; + + unsigned ScalarSize = V.getOperand(0).getValueType().getSizeInBits(); + if (ScalarSize != 32 && ScalarSize != 64) + return false; + if (!Is256 && ScalarSize == 64) + return false; + + V = V.getOperand(0); + if (!MayFoldLoad(V)) + return false; + + // Return the load node + Op = V; + return true; +} + +static +SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG, + const TargetLowering &TLI, + const X86Subtarget *Subtarget) { + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + EVT VT = Op.getValueType(); + DebugLoc dl = Op.getDebugLoc(); + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + + if (isZeroShuffle(SVOp)) + return getZeroVector(VT, Subtarget->hasXMMInt(), DAG, dl); + + // Handle splat operations + if (SVOp->isSplat()) { + unsigned NumElem = VT.getVectorNumElements(); + int Size = VT.getSizeInBits(); + // Special case, this is the only place now where it's allowed to return + // a vector_shuffle operation without using a target specific node, because + // *hopefully* it will be optimized away by the dag combiner. FIXME: should + // this be moved to DAGCombine instead? + if (NumElem <= 4 && CanXFormVExtractWithShuffleIntoLoad(Op, DAG, TLI)) + return Op; + + // Use vbroadcast whenever the splat comes from a foldable load + if (Subtarget->hasAVX() && isVectorBroadcast(V1)) + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, V1); + + // Handle splats by matching through known shuffle masks + if ((Size == 128 && NumElem <= 4) || + (Size == 256 && NumElem < 8)) + return SDValue(); + + // All remaning splats are promoted to target supported vector shuffles. + return PromoteSplat(SVOp, DAG); + } + + // If the shuffle can be profitably rewritten as a narrower shuffle, then + // do it! 
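+  // For illustration: the v8i16 mask <2,3, 10,11, 0,1, 14,15> pairs up
+  // cleanly and is rewritten by RewriteAsNarrowerShuffle as a v4i32 shuffle
+  // with mask <1, 5, 0, 7>.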
+ if (VT == MVT::v8i16 || VT == MVT::v16i8) { + SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); + if (NewOp.getNode()) + return DAG.getNode(ISD::BITCAST, dl, VT, NewOp); + } else if ((VT == MVT::v4i32 || + (VT == MVT::v4f32 && Subtarget->hasXMMInt()))) { + // FIXME: Figure out a cleaner way to do this. + // Try to make use of movq to zero out the top part. + if (ISD::isBuildVectorAllZeros(V2.getNode())) { + SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); + if (NewOp.getNode()) { + if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false)) + return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0), + DAG, Subtarget, dl); + } + } else if (ISD::isBuildVectorAllZeros(V1.getNode())) { + SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); + if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp))) + return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1), + DAG, Subtarget, dl); + } + } + return SDValue(); +} + +SDValue +X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + EVT VT = Op.getValueType(); + DebugLoc dl = Op.getDebugLoc(); + unsigned NumElems = VT.getVectorNumElements(); + bool isMMX = VT.getSizeInBits() == 64; + bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; + bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; + bool V1IsSplat = false; + bool V2IsSplat = false; + bool HasXMMInt = Subtarget->hasXMMInt(); + MachineFunction &MF = DAG.getMachineFunction(); + bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize); + + // Shuffle operations on MMX not supported. + if (isMMX) + return Op; + + // Vector shuffle lowering takes 3 steps: + // + // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable + // narrowing and commutation of operands should be handled. + // 2) Matching of shuffles with known shuffle masks to x86 target specific + // shuffle nodes. + // 3) Rewriting of unmatched masks into new generic shuffle operations, + // so the shuffle can be broken into other shuffles and the legalizer can + // try the lowering again. + // + // The general ideia is that no vector_shuffle operation should be left to + // be matched during isel, all of them must be converted to a target specific + // node here. + + // Normalize the input vectors. Here splats, zeroed vectors, profitable + // narrowing and commutation of operands should be handled. The actual code + // doesn't include all of those, work in progress... + SDValue NewOp = NormalizeVectorShuffle(Op, DAG, *this, Subtarget); + if (NewOp.getNode()) + return NewOp; + + // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and + // unpckh_undef). Only use pshufd if speed is more important than size. 
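+  // Encoding note (added for illustration): pshufd carries an imm8 that the
+  // punpckl/h self-unpack forms do not, so the unpck forms are at least one
+  // byte shorter, which is why they win under OptForSize.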
+ if (OptForSize && X86::isUNPCKL_v_undef_Mask(SVOp)) + return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG); + if (OptForSize && X86::isUNPCKH_v_undef_Mask(SVOp)) + return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); + + if (X86::isMOVDDUPMask(SVOp) && + (Subtarget->hasSSE3() || Subtarget->hasAVX()) && + V2IsUndef && RelaxedMayFoldVectorLoad(V1)) + return getMOVDDup(Op, dl, V1, DAG); + + if (X86::isMOVHLPS_v_undef_Mask(SVOp)) + return getMOVHighToLow(Op, dl, DAG); + + // Use to match splats + if (HasXMMInt && X86::isUNPCKHMask(SVOp) && V2IsUndef && + (VT == MVT::v2f64 || VT == MVT::v2i64)) + return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); + + if (X86::isPSHUFDMask(SVOp)) { + // The actual implementation will match the mask in the if above and then + // during isel it can match several different instructions, not only pshufd + // as its name says, sad but true, emulate the behavior for now... + if (X86::isMOVDDUPMask(SVOp) && ((VT == MVT::v4f32 || VT == MVT::v2i64))) + return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG); + + unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp); + + if (HasXMMInt && (VT == MVT::v4f32 || VT == MVT::v4i32)) + return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG); + + return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V1, + TargetMask, DAG); + } + + // Check if this can be converted into a logical shift. + bool isLeft = false; + unsigned ShAmt = 0; + SDValue ShVal; + bool isShift = getSubtarget()->hasXMMInt() && + isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); + if (isShift && ShVal.hasOneUse()) { + // If the shifted value has multiple uses, it may be cheaper to use + // v_set0 + movlhps or movhlps, etc. + EVT EltVT = VT.getVectorElementType(); + ShAmt *= EltVT.getSizeInBits(); + return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); + } + + if (X86::isMOVLMask(SVOp)) { + if (V1IsUndef) + return V2; + if (ISD::isBuildVectorAllZeros(V1.getNode())) + return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl); + if (!X86::isMOVLPMask(SVOp)) { + if (HasXMMInt && (VT == MVT::v2i64 || VT == MVT::v2f64)) + return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); + + if (VT == MVT::v4i32 || VT == MVT::v4f32) + return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG); + } + } + + // FIXME: fold these into legal mask. + if (X86::isMOVLHPSMask(SVOp) && !X86::isUNPCKLMask(SVOp)) + return getMOVLowToHigh(Op, dl, DAG, HasXMMInt); + + if (X86::isMOVHLPSMask(SVOp)) + return getMOVHighToLow(Op, dl, DAG); + + if (X86::isMOVSHDUPMask(SVOp, Subtarget)) + return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG); + + if (X86::isMOVSLDUPMask(SVOp, Subtarget)) + return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG); + + if (X86::isMOVLPMask(SVOp)) + return getMOVLP(Op, dl, DAG, HasXMMInt); + + if (ShouldXformToMOVHLPS(SVOp) || + ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp)) + return CommuteVectorShuffle(SVOp, DAG); + + if (isShift) { + // No better options. Use a vshl / vsrl. + EVT EltVT = VT.getVectorElementType(); + ShAmt *= EltVT.getSizeInBits(); + return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); + } + + bool Commuted = false; + // FIXME: This should also accept a bitcast of a splat? Be careful, not + // 1,1,1,1 -> v8i16 though. + V1IsSplat = isSplatVector(V1.getNode()); + V2IsSplat = isSplatVector(V2.getNode()); + + // Canonicalize the splat or undef, if present, to be on the RHS. 
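+  // For illustration: commuting swaps the two inputs and remaps every mask
+  // index to the matching lane of the other input, e.g. a v4i32
+  // shuffle(Splat, V2, <0,4,1,5>) becomes shuffle(V2, Splat, <4,0,5,1>)
+  // (example mask invented for illustration).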
+ if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) { + Op = CommuteVectorShuffle(SVOp, DAG); + SVOp = cast<ShuffleVectorSDNode>(Op); + V1 = SVOp->getOperand(0); + V2 = SVOp->getOperand(1); + std::swap(V1IsSplat, V2IsSplat); + std::swap(V1IsUndef, V2IsUndef); + Commuted = true; + } + + if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) { + // Shuffling low element of v1 into undef, just return v1. + if (V2IsUndef) + return V1; + // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which + // the instruction selector will not match, so get a canonical MOVL with + // swapped operands to undo the commute. + return getMOVL(DAG, dl, VT, V2, V1); + } + + if (X86::isUNPCKLMask(SVOp)) + return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V2, DAG); + + if (X86::isUNPCKHMask(SVOp)) + return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V2, DAG); + + if (V2IsSplat) { + // Normalize mask so all entries that point to V2 points to its first + // element then try to match unpck{h|l} again. If match, return a + // new vector_shuffle with the corrected mask. + SDValue NewMask = NormalizeMask(SVOp, DAG); + ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask); + if (NSVOp != SVOp) { + if (X86::isUNPCKLMask(NSVOp, true)) { + return NewMask; + } else if (X86::isUNPCKHMask(NSVOp, true)) { + return NewMask; + } + } + } + + if (Commuted) { + // Commute is back and try unpck* again. + // FIXME: this seems wrong. + SDValue NewOp = CommuteVectorShuffle(SVOp, DAG); + ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp); + + if (X86::isUNPCKLMask(NewSVOp)) + return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V2, V1, DAG); + + if (X86::isUNPCKHMask(NewSVOp)) + return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V2, V1, DAG); + } + + // Normalize the node to match x86 shuffle ops if needed + if (V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp)) + return CommuteVectorShuffle(SVOp, DAG); + + // The checks below are all present in isShuffleMaskLegal, but they are + // inlined here right now to enable us to directly emit target specific + // nodes, and remove one by one until they don't return Op anymore. + SmallVector<int, 16> M; + SVOp->getMask(M); + + if (isPALIGNRMask(M, VT, Subtarget->hasSSSE3() || Subtarget->hasAVX())) + return getTargetShuffleNode(X86ISD::PALIGN, dl, VT, V1, V2, + X86::getShufflePALIGNRImmediate(SVOp), + DAG); + + if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) && + SVOp->getSplatIndex() == 0 && V2IsUndef) { + if (VT == MVT::v2f64) + return getTargetShuffleNode(X86ISD::UNPCKLPD, dl, VT, V1, V1, DAG); + if (VT == MVT::v2i64) + return getTargetShuffleNode(X86ISD::PUNPCKLQDQ, dl, VT, V1, V1, DAG); + } + + if (isPSHUFHWMask(M, VT)) + return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1, + X86::getShufflePSHUFHWImmediate(SVOp), + DAG); + + if (isPSHUFLWMask(M, VT)) + return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1, + X86::getShufflePSHUFLWImmediate(SVOp), + DAG); + + if (isSHUFPMask(M, VT)) + return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2, + X86::getShuffleSHUFImmediate(SVOp), DAG); + + if (X86::isUNPCKL_v_undef_Mask(SVOp)) + return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG); + if (X86::isUNPCKH_v_undef_Mask(SVOp)) + return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); + + //===--------------------------------------------------------------------===// + // Generate target specific nodes for 128 or 256-bit shuffles only + // supported in the AVX instruction set. 
+ // + + // Handle VMOVDDUPY permutations + if (isMOVDDUPYMask(SVOp, Subtarget)) + return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG); + + // Handle VPERMILPS* permutations + if (isVPERMILPSMask(M, VT, Subtarget)) + return getTargetShuffleNode(getVPERMILOpcode(VT), dl, VT, V1, + getShuffleVPERMILPSImmediate(SVOp), DAG); + + // Handle VPERMILPD* permutations + if (isVPERMILPDMask(M, VT, Subtarget)) + return getTargetShuffleNode(getVPERMILOpcode(VT), dl, VT, V1, + getShuffleVPERMILPDImmediate(SVOp), DAG); + + // Handle VPERM2F128 permutations + if (isVPERM2F128Mask(M, VT, Subtarget)) + return getTargetShuffleNode(X86ISD::VPERM2F128, dl, VT, V1, V2, + getShuffleVPERM2F128Immediate(SVOp), DAG); + + // Handle VSHUFPSY permutations + if (isVSHUFPSYMask(M, VT, Subtarget)) + return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2, + getShuffleVSHUFPSYImmediate(SVOp), DAG); + + // Handle VSHUFPDY permutations + if (isVSHUFPDYMask(M, VT, Subtarget)) + return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2, + getShuffleVSHUFPDYImmediate(SVOp), DAG); + + //===--------------------------------------------------------------------===// + // Since no target specific shuffle was selected for this generic one, + // lower it into other known shuffles. FIXME: this isn't true yet, but + // this is the plan. + // + + // Handle v8i16 specifically since SSE can do byte extraction and insertion. + if (VT == MVT::v8i16) { + SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, DAG); + if (NewOp.getNode()) + return NewOp; + } + + if (VT == MVT::v16i8) { + SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this); + if (NewOp.getNode()) + return NewOp; + } + + // Handle all 128-bit wide vectors with 4 elements, and match them with + // several different shuffle types. + if (NumElems == 4 && VT.getSizeInBits() == 128) + return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG); + + // Handle general 256-bit shuffles + if (VT.is256BitVector()) + return LowerVECTOR_SHUFFLE_256(SVOp, DAG); + + return SDValue(); +} + +SDValue +X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, + SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + DebugLoc dl = Op.getDebugLoc(); + + if (Op.getOperand(0).getValueType().getSizeInBits() != 128) + return SDValue(); + + if (VT.getSizeInBits() == 8) { + SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, + Op.getOperand(0), Op.getOperand(1)); + SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, + DAG.getValueType(VT)); + return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); + } else if (VT.getSizeInBits() == 16) { + unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + // If Idx is 0, it's cheaper to do a move instead of a pextrw. + if (Idx == 0) + return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, + DAG.getNode(ISD::BITCAST, dl, + MVT::v4i32, + Op.getOperand(0)), + Op.getOperand(1))); + SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, + Op.getOperand(0), Op.getOperand(1)); + SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, + DAG.getValueType(VT)); + return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); + } else if (VT == MVT::f32) { + // EXTRACTPS outputs to a GPR32 register which will require a movd to copy + // the result back to FR32 register. It's only worth matching if the + // result has a single use which is a store or a bitcast to i32. 
And in + // the case of a store, it's not worth it if the index is a constant 0, + // because a MOVSSmr can be used instead, which is smaller and faster. + if (!Op.hasOneUse()) + return SDValue(); + SDNode *User = *Op.getNode()->use_begin(); + if ((User->getOpcode() != ISD::STORE || + (isa<ConstantSDNode>(Op.getOperand(1)) && + cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) && + (User->getOpcode() != ISD::BITCAST || + User->getValueType(0) != MVT::i32)) + return SDValue(); + SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, + DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, + Op.getOperand(0)), + Op.getOperand(1)); + return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract); + } else if (VT == MVT::i32) { + // ExtractPS works with constant index. + if (isa<ConstantSDNode>(Op.getOperand(1))) + return Op; + } + return SDValue(); +} + + +SDValue +X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const { + if (!isa<ConstantSDNode>(Op.getOperand(1))) + return SDValue(); + + SDValue Vec = Op.getOperand(0); + EVT VecVT = Vec.getValueType(); + + // If this is a 256-bit vector result, first extract the 128-bit vector and + // then extract the element from the 128-bit vector. + if (VecVT.getSizeInBits() == 256) { + DebugLoc dl = Op.getNode()->getDebugLoc(); + unsigned NumElems = VecVT.getVectorNumElements(); + SDValue Idx = Op.getOperand(1); + unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + + // Get the 128-bit vector. + bool Upper = IdxVal >= NumElems/2; + Vec = Extract128BitVector(Vec, + DAG.getConstant(Upper ? NumElems/2 : 0, MVT::i32), DAG, dl); + + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, + Upper ? DAG.getConstant(IdxVal-NumElems/2, MVT::i32) : Idx); + } + + assert(Vec.getValueSizeInBits() <= 128 && "Unexpected vector length"); + + if (Subtarget->hasSSE41() || Subtarget->hasAVX()) { + SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); + if (Res.getNode()) + return Res; + } + + EVT VT = Op.getValueType(); + DebugLoc dl = Op.getDebugLoc(); + // TODO: handle v16i8. + if (VT.getSizeInBits() == 16) { + SDValue Vec = Op.getOperand(0); + unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + if (Idx == 0) + return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, + DAG.getNode(ISD::BITCAST, dl, + MVT::v4i32, Vec), + Op.getOperand(1))); + // Transform it so it match pextrw which produces a 32-bit result. + EVT EltVT = MVT::i32; + SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, + Op.getOperand(0), Op.getOperand(1)); + SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract, + DAG.getValueType(VT)); + return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); + } else if (VT.getSizeInBits() == 32) { + unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + if (Idx == 0) + return Op; + + // SHUFPS the element to the lowest double word, then movss. + int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 }; + EVT VVT = Op.getOperand(0).getValueType(); + SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), + DAG.getUNDEF(VVT), Mask); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, + DAG.getIntPtrConstant(0)); + } else if (VT.getSizeInBits() == 64) { + // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b + // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught + // to match extract_elt for f64. 
+ unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + if (Idx == 0) + return Op; + + // UNPCKHPD the element to the lowest double word, then movsd. + // Note if the lower 64 bits of the result of the UNPCKHPD is then stored + // to a f64mem, the whole operation is folded into a single MOVHPDmr. + int Mask[2] = { 1, -1 }; + EVT VVT = Op.getOperand(0).getValueType(); + SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), + DAG.getUNDEF(VVT), Mask); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, + DAG.getIntPtrConstant(0)); + } + + return SDValue(); +} + +SDValue +X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, + SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + EVT EltVT = VT.getVectorElementType(); + DebugLoc dl = Op.getDebugLoc(); + + SDValue N0 = Op.getOperand(0); + SDValue N1 = Op.getOperand(1); + SDValue N2 = Op.getOperand(2); + + if (VT.getSizeInBits() == 256) + return SDValue(); + + if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) && + isa<ConstantSDNode>(N2)) { + unsigned Opc; + if (VT == MVT::v8i16) + Opc = X86ISD::PINSRW; + else if (VT == MVT::v16i8) + Opc = X86ISD::PINSRB; + else + Opc = X86ISD::PINSRB; + + // Transform it so it match pinsr{b,w} which expects a GR32 as its second + // argument. + if (N1.getValueType() != MVT::i32) + N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); + if (N2.getValueType() != MVT::i32) + N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); + return DAG.getNode(Opc, dl, VT, N0, N1, N2); + } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) { + // Bits [7:6] of the constant are the source select. This will always be + // zero here. The DAG Combiner may combine an extract_elt index into these + // bits. For example (insert (extract, 3), 2) could be matched by putting + // the '3' into bits [7:6] of X86ISD::INSERTPS. + // Bits [5:4] of the constant are the destination select. This is the + // value of the incoming immediate. + // Bits [3:0] of the constant are the zero mask. The DAG Combiner may + // combine either bitwise AND or insert of float 0.0 to set these bits. + N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4); + // Create this as a scalar to vector.. + N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); + return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); + } else if (EltVT == MVT::i32 && isa<ConstantSDNode>(N2)) { + // PINSR* works with constant index. + return Op; + } + return SDValue(); +} + +SDValue +X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + EVT EltVT = VT.getVectorElementType(); + + DebugLoc dl = Op.getDebugLoc(); + SDValue N0 = Op.getOperand(0); + SDValue N1 = Op.getOperand(1); + SDValue N2 = Op.getOperand(2); + + // If this is a 256-bit vector result, first extract the 128-bit vector, + // insert the element into the extracted half and then place it back. + if (VT.getSizeInBits() == 256) { + if (!isa<ConstantSDNode>(N2)) + return SDValue(); + + // Get the desired 128-bit vector half. + unsigned NumElems = VT.getVectorNumElements(); + unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue(); + bool Upper = IdxVal >= NumElems/2; + SDValue Ins128Idx = DAG.getConstant(Upper ? NumElems/2 : 0, MVT::i32); + SDValue V = Extract128BitVector(N0, Ins128Idx, DAG, dl); + + // Insert the element into the desired half. + V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, + N1, Upper ? 
DAG.getConstant(IdxVal-NumElems/2, MVT::i32) : N2); + + // Insert the changed part back to the 256-bit vector + return Insert128BitVector(N0, V, Ins128Idx, DAG, dl); + } + + if (Subtarget->hasSSE41() || Subtarget->hasAVX()) + return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG); + + if (EltVT == MVT::i8) + return SDValue(); + + if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) { + // Transform it so it match pinsrw which expects a 16-bit value in a GR32 + // as its second argument. + if (N1.getValueType() != MVT::i32) + N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); + if (N2.getValueType() != MVT::i32) + N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); + return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2); + } + return SDValue(); +} + +SDValue +X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const { + LLVMContext *Context = DAG.getContext(); + DebugLoc dl = Op.getDebugLoc(); + EVT OpVT = Op.getValueType(); + + // If this is a 256-bit vector result, first insert into a 128-bit + // vector and then insert into the 256-bit vector. + if (OpVT.getSizeInBits() > 128) { + // Insert into a 128-bit vector. + EVT VT128 = EVT::getVectorVT(*Context, + OpVT.getVectorElementType(), + OpVT.getVectorNumElements() / 2); + + Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0)); + + // Insert the 128-bit vector. + return Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, OpVT), Op, + DAG.getConstant(0, MVT::i32), + DAG, dl); + } + + if (Op.getValueType() == MVT::v1i64 && + Op.getOperand(0).getValueType() == MVT::i64) + return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0)); + + SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); + assert(Op.getValueType().getSimpleVT().getSizeInBits() == 128 && + "Expected an SSE type!"); + return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), + DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt)); +} + +// Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in +// a simple subregister reference or explicit instructions to grab +// upper bits of a vector. +SDValue +X86TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { + if (Subtarget->hasAVX()) { + DebugLoc dl = Op.getNode()->getDebugLoc(); + SDValue Vec = Op.getNode()->getOperand(0); + SDValue Idx = Op.getNode()->getOperand(1); + + if (Op.getNode()->getValueType(0).getSizeInBits() == 128 + && Vec.getNode()->getValueType(0).getSizeInBits() == 256) { + return Extract128BitVector(Vec, Idx, DAG, dl); + } + } + return SDValue(); +} + +// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a +// simple superregister reference or explicit instructions to insert +// the upper bits of a vector. +SDValue +X86TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { + if (Subtarget->hasAVX()) { + DebugLoc dl = Op.getNode()->getDebugLoc(); + SDValue Vec = Op.getNode()->getOperand(0); + SDValue SubVec = Op.getNode()->getOperand(1); + SDValue Idx = Op.getNode()->getOperand(2); + + if (Op.getNode()->getValueType(0).getSizeInBits() == 256 + && SubVec.getNode()->getValueType(0).getSizeInBits() == 128) { + return Insert128BitVector(Vec, SubVec, Idx, DAG, dl); + } + } + return SDValue(); +} + +// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as +// their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is +// one of the above mentioned nodes. It has to be wrapped because otherwise +// Select(N) returns N. 
So the raw TargetGlobalAddress nodes, etc. can only +// be used to form addressing mode. These wrapped nodes will be selected +// into MOV32ri. +SDValue +X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { + ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); + + // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the + // global base reg. + unsigned char OpFlag = 0; + unsigned WrapperKind = X86ISD::Wrapper; + CodeModel::Model M = getTargetMachine().getCodeModel(); + + if (Subtarget->isPICStyleRIPRel() && + (M == CodeModel::Small || M == CodeModel::Kernel)) + WrapperKind = X86ISD::WrapperRIP; + else if (Subtarget->isPICStyleGOT()) + OpFlag = X86II::MO_GOTOFF; + else if (Subtarget->isPICStyleStubPIC()) + OpFlag = X86II::MO_PIC_BASE_OFFSET; + + SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(), + CP->getAlignment(), + CP->getOffset(), OpFlag); + DebugLoc DL = CP->getDebugLoc(); + Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); + // With PIC, the address is actually $g + Offset. + if (OpFlag) { + Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), + DAG.getNode(X86ISD::GlobalBaseReg, + DebugLoc(), getPointerTy()), + Result); + } + + return Result; +} + +SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { + JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); + + // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the + // global base reg. + unsigned char OpFlag = 0; + unsigned WrapperKind = X86ISD::Wrapper; + CodeModel::Model M = getTargetMachine().getCodeModel(); + + if (Subtarget->isPICStyleRIPRel() && + (M == CodeModel::Small || M == CodeModel::Kernel)) + WrapperKind = X86ISD::WrapperRIP; + else if (Subtarget->isPICStyleGOT()) + OpFlag = X86II::MO_GOTOFF; + else if (Subtarget->isPICStyleStubPIC()) + OpFlag = X86II::MO_PIC_BASE_OFFSET; + + SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(), + OpFlag); + DebugLoc DL = JT->getDebugLoc(); + Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); + + // With PIC, the address is actually $g + Offset. + if (OpFlag) + Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), + DAG.getNode(X86ISD::GlobalBaseReg, + DebugLoc(), getPointerTy()), + Result); + + return Result; +} + +SDValue +X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { + const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol(); + + // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the + // global base reg. + unsigned char OpFlag = 0; + unsigned WrapperKind = X86ISD::Wrapper; + CodeModel::Model M = getTargetMachine().getCodeModel(); + + if (Subtarget->isPICStyleRIPRel() && + (M == CodeModel::Small || M == CodeModel::Kernel)) { + if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF()) + OpFlag = X86II::MO_GOTPCREL; + WrapperKind = X86ISD::WrapperRIP; + } else if (Subtarget->isPICStyleGOT()) { + OpFlag = X86II::MO_GOT; + } else if (Subtarget->isPICStyleStubPIC()) { + OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE; + } else if (Subtarget->isPICStyleStubNoDynamic()) { + OpFlag = X86II::MO_DARWIN_NONLAZY; + } + + SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag); + + DebugLoc DL = Op.getDebugLoc(); + Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); + + + // With PIC, the address is actually $g + Offset. 
+ if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && + !Subtarget->is64Bit()) { + Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), + DAG.getNode(X86ISD::GlobalBaseReg, + DebugLoc(), getPointerTy()), + Result); + } + + // For symbols that require a load from a stub to get the address, emit the + // load. + if (isGlobalStubReference(OpFlag)) + Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result, + MachinePointerInfo::getGOT(), false, false, 0); + + return Result; +} + +SDValue +X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { + // Create the TargetBlockAddressAddress node. + unsigned char OpFlags = + Subtarget->ClassifyBlockAddressReference(); + CodeModel::Model M = getTargetMachine().getCodeModel(); + const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); + DebugLoc dl = Op.getDebugLoc(); + SDValue Result = DAG.getBlockAddress(BA, getPointerTy(), + /*isTarget=*/true, OpFlags); + + if (Subtarget->isPICStyleRIPRel() && + (M == CodeModel::Small || M == CodeModel::Kernel)) + Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); + else + Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); + + // With PIC, the address is actually $g + Offset. + if (isGlobalRelativeToPICBase(OpFlags)) { + Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), + DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), + Result); + } + + return Result; +} + +SDValue +X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, + int64_t Offset, + SelectionDAG &DAG) const { + // Create the TargetGlobalAddress node, folding in the constant + // offset if it is legal. + unsigned char OpFlags = + Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); + CodeModel::Model M = getTargetMachine().getCodeModel(); + SDValue Result; + if (OpFlags == X86II::MO_NO_FLAG && + X86::isOffsetSuitableForCodeModel(Offset, M)) { + // A direct static reference to a global. + Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset); + Offset = 0; + } else { + Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags); + } + + if (Subtarget->isPICStyleRIPRel() && + (M == CodeModel::Small || M == CodeModel::Kernel)) + Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); + else + Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); + + // With PIC, the address is actually $g + Offset. + if (isGlobalRelativeToPICBase(OpFlags)) { + Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), + DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), + Result); + } + + // For globals that require a load from a stub to get the address, emit the + // load. + if (isGlobalStubReference(OpFlags)) + Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result, + MachinePointerInfo::getGOT(), false, false, 0); + + // If there was a non-zero offset that we didn't fold, create an explicit + // addition for it. 
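+  // (For illustration: a GOT- or stub-indirect reference keeps OpFlags set,
+  //  so the byte offset is never folded into the TargetGlobalAddress and is
+  //  applied here, after the indirection, with a plain ADD.)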
+ if (Offset != 0) + Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result, + DAG.getConstant(Offset, getPointerTy())); + + return Result; +} + +SDValue +X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { + const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); + int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); + return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG); +} + +static SDValue +GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, + SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, + unsigned char OperandFlags) { + MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); + DebugLoc dl = GA->getDebugLoc(); + SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, + GA->getValueType(0), + GA->getOffset(), + OperandFlags); + if (InFlag) { + SDValue Ops[] = { Chain, TGA, *InFlag }; + Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3); + } else { + SDValue Ops[] = { Chain, TGA }; + Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2); + } + + // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. + MFI->setAdjustsStack(true); + + SDValue Flag = Chain.getValue(1); + return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); +} + +// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit +static SDValue +LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, + const EVT PtrVT) { + SDValue InFlag; + DebugLoc dl = GA->getDebugLoc(); // ? function entry point might be better + SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, + DAG.getNode(X86ISD::GlobalBaseReg, + DebugLoc(), PtrVT), InFlag); + InFlag = Chain.getValue(1); + + return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); +} + +// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit +static SDValue +LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, + const EVT PtrVT) { + return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, + X86::RAX, X86II::MO_TLSGD); +} + +// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or +// "local exec" model. +static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, + const EVT PtrVT, TLSModel::Model model, + bool is64Bit) { + DebugLoc dl = GA->getDebugLoc(); + + // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit). + Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(), + is64Bit ? 257 : 256)); + + SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), + DAG.getIntPtrConstant(0), + MachinePointerInfo(Ptr), false, false, 0); + + unsigned char OperandFlags = 0; + // Most TLS accesses are not RIP relative, even on x86-64. One exception is + // initialexec. + unsigned WrapperKind = X86ISD::Wrapper; + if (model == TLSModel::LocalExec) { + OperandFlags = is64Bit ? 
X86II::MO_TPOFF : X86II::MO_NTPOFF; + } else if (is64Bit) { + assert(model == TLSModel::InitialExec); + OperandFlags = X86II::MO_GOTTPOFF; + WrapperKind = X86ISD::WrapperRIP; + } else { + assert(model == TLSModel::InitialExec); + OperandFlags = X86II::MO_INDNTPOFF; + } + + // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial + // exec) + SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, + GA->getValueType(0), + GA->getOffset(), OperandFlags); + SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); + + if (model == TLSModel::InitialExec) + Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, + MachinePointerInfo::getGOT(), false, false, 0); + + // The address of the thread local variable is the add of the thread + // pointer with the offset of the variable. + return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); +} + +SDValue +X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { + + GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); + const GlobalValue *GV = GA->getGlobal(); + + if (Subtarget->isTargetELF()) { + // TODO: implement the "local dynamic" model + // TODO: implement the "initial exec"model for pic executables + + // If GV is an alias then use the aliasee for determining + // thread-localness. + if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) + GV = GA->resolveAliasedGlobal(false); + + TLSModel::Model model + = getTLSModel(GV, getTargetMachine().getRelocationModel()); + + switch (model) { + case TLSModel::GeneralDynamic: + case TLSModel::LocalDynamic: // not implemented + if (Subtarget->is64Bit()) + return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy()); + return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy()); + + case TLSModel::InitialExec: + case TLSModel::LocalExec: + return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, + Subtarget->is64Bit()); + } + } else if (Subtarget->isTargetDarwin()) { + // Darwin only has one model of TLS. Lower to that. + unsigned char OpFlag = 0; + unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ? + X86ISD::WrapperRIP : X86ISD::Wrapper; + + // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the + // global base reg. + bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) && + !Subtarget->is64Bit(); + if (PIC32) + OpFlag = X86II::MO_TLVP_PIC_BASE; + else + OpFlag = X86II::MO_TLVP; + DebugLoc DL = Op.getDebugLoc(); + SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, + GA->getValueType(0), + GA->getOffset(), OpFlag); + SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); + + // With PIC32, the address is actually $g + Offset. + if (PIC32) + Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(), + DAG.getNode(X86ISD::GlobalBaseReg, + DebugLoc(), getPointerTy()), + Offset); + + // Lowering the machine isd will make sure everything is in the right + // location. + SDValue Chain = DAG.getEntryNode(); + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue Args[] = { Chain, Offset }; + Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args, 2); + + // TLSCALL will be codegen'ed as call. Inform MFI that function has calls. + MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + MFI->setAdjustsStack(true); + + // And our return value (tls address) is in the standard call return value + // location. + unsigned Reg = Subtarget->is64Bit() ? 
X86::RAX : X86::EAX; + return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(), + Chain.getValue(1)); + } + + assert(false && + "TLS not implemented for this target."); + + llvm_unreachable("Unreachable"); + return SDValue(); +} + + +/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values and +/// take a 2 x i32 value to shift plus a shift amount. +SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const { + assert(Op.getNumOperands() == 3 && "Not a double-shift!"); + EVT VT = Op.getValueType(); + unsigned VTBits = VT.getSizeInBits(); + DebugLoc dl = Op.getDebugLoc(); + bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; + SDValue ShOpLo = Op.getOperand(0); + SDValue ShOpHi = Op.getOperand(1); + SDValue ShAmt = Op.getOperand(2); + SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi, + DAG.getConstant(VTBits - 1, MVT::i8)) + : DAG.getConstant(0, VT); + + SDValue Tmp2, Tmp3; + if (Op.getOpcode() == ISD::SHL_PARTS) { + Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); + Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); + } else { + Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); + Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt); + } + + SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, + DAG.getConstant(VTBits, MVT::i8)); + SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32, + AndNode, DAG.getConstant(0, MVT::i8)); + + SDValue Hi, Lo; + SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8); + SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond }; + SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; + + if (Op.getOpcode() == ISD::SHL_PARTS) { + Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); + Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); + } else { + Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); + Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); + } + + SDValue Ops[2] = { Lo, Hi }; + return DAG.getMergeValues(Ops, 2, dl); +} + +SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, + SelectionDAG &DAG) const { + EVT SrcVT = Op.getOperand(0).getValueType(); + + if (SrcVT.isVector()) + return SDValue(); + + assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 && + "Unknown SINT_TO_FP to lower!"); + + // These are really Legal; return the operand so the caller accepts it as + // Legal. 
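+  // (Illustration: with the result in an SSE register these cases are single
+  //  instructions anyway, e.g. cvtsi2sd for i32 -> f64 and the 64-bit form
+  //  of cvtsi2sd for i64 -> f64 on x86-64, so there is nothing to expand.)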
+ if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) + return Op; + if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && + Subtarget->is64Bit()) { + return Op; + } + + DebugLoc dl = Op.getDebugLoc(); + unsigned Size = SrcVT.getSizeInBits()/8; + MachineFunction &MF = DAG.getMachineFunction(); + int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); + SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); + SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), + StackSlot, + MachinePointerInfo::getFixedStack(SSFI), + false, false, 0); + return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); +} + +SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, + SDValue StackSlot, + SelectionDAG &DAG) const { + // Build the FILD + DebugLoc DL = Op.getDebugLoc(); + SDVTList Tys; + bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); + if (useSSE) + Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue); + else + Tys = DAG.getVTList(Op.getValueType(), MVT::Other); + + unsigned ByteSize = SrcVT.getSizeInBits()/8; + + FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot); + MachineMemOperand *MMO; + if (FI) { + int SSFI = FI->getIndex(); + MMO = + DAG.getMachineFunction() + .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), + MachineMemOperand::MOLoad, ByteSize, ByteSize); + } else { + MMO = cast<LoadSDNode>(StackSlot)->getMemOperand(); + StackSlot = StackSlot.getOperand(1); + } + SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; + SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : + X86ISD::FILD, DL, + Tys, Ops, array_lengthof(Ops), + SrcVT, MMO); + + if (useSSE) { + Chain = Result.getValue(1); + SDValue InFlag = Result.getValue(2); + + // FIXME: Currently the FST is flagged to the FILD_FLAG. This + // shouldn't be necessary except that RFP cannot be live across + // multiple blocks. When stackifier is fixed, they can be uncoupled. + MachineFunction &MF = DAG.getMachineFunction(); + unsigned SSFISize = Op.getValueType().getSizeInBits()/8; + int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false); + SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); + Tys = DAG.getVTList(MVT::Other); + SDValue Ops[] = { + Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag + }; + MachineMemOperand *MMO = + DAG.getMachineFunction() + .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), + MachineMemOperand::MOStore, SSFISize, SSFISize); + + Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, + Ops, array_lengthof(Ops), + Op.getValueType(), MMO); + Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot, + MachinePointerInfo::getFixedStack(SSFI), + false, false, 0); + } + + return Result; +} + +// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. +SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, + SelectionDAG &DAG) const { + // This algorithm is not obvious. Here it is in C code, more or less: + /* + double uint64_to_double( uint32_t hi, uint32_t lo ) { + static const __m128i exp = { 0x4330000045300000ULL, 0 }; + static const __m128d bias = { 0x1.0p84, 0x1.0p52 }; + + // Copy ints to xmm registers. + __m128i xh = _mm_cvtsi32_si128( hi ); + __m128i xl = _mm_cvtsi32_si128( lo ); + + // Combine into low half of a single xmm register. + __m128i x = _mm_unpacklo_epi32( xh, xl ); + __m128d d; + double sd; + + // Merge in appropriate exponents to give the integer bits the right + // magnitude. 
+ x = _mm_unpacklo_epi32( x, exp ); + + // Subtract away the biases to deal with the IEEE-754 double precision + // implicit 1. + d = _mm_sub_pd( (__m128d) x, bias ); + + // All conversions up to here are exact. The correctly rounded result is + // calculated using the current rounding mode using the following + // horizontal add. + d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) ); + _mm_store_sd( &sd, d ); // Because we are returning doubles in XMM, this + // store doesn't really need to be here (except + // maybe to zero the other double) + return sd; + } + */ + + DebugLoc dl = Op.getDebugLoc(); + LLVMContext *Context = DAG.getContext(); + + // Build some magic constants. + std::vector<Constant*> CV0; + CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000))); + CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000))); + CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); + CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); + Constant *C0 = ConstantVector::get(CV0); + SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); + + std::vector<Constant*> CV1; + CV1.push_back( + ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL)))); + CV1.push_back( + ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL)))); + Constant *C1 = ConstantVector::get(CV1); + SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16); + + SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, + DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, + Op.getOperand(0), + DAG.getIntPtrConstant(1))); + SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, + DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, + Op.getOperand(0), + DAG.getIntPtrConstant(0))); + SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2); + SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, + MachinePointerInfo::getConstantPool(), + false, false, 16); + SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0); + SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck2); + SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, + MachinePointerInfo::getConstantPool(), + false, false, 16); + SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); + + // Add the halves; easiest way is to swap them into another reg first. + int ShufMask[2] = { 1, -1 }; + SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, + DAG.getUNDEF(MVT::v2f64), ShufMask); + SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add, + DAG.getIntPtrConstant(0)); +} + +// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion. +SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, + SelectionDAG &DAG) const { + DebugLoc dl = Op.getDebugLoc(); + // FP constant to bias correct the final result. + SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), + MVT::f64); + + // Load the 32-bit value into an XMM register. + SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, + Op.getOperand(0)); + + // Zero out the upper parts of the register. + Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget->hasXMMInt(), + DAG); + + Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, + DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load), + DAG.getIntPtrConstant(0)); + + // Or the load with the bias. 
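+  // The same trick in C, for illustration only (helper name invented):
+  //   double uint32_to_double(uint32_t x) {
+  //     union { uint64_t i; double d; } u;
+  //     u.i = 0x4330000000000000ULL | x;  // bits of 2^52 + x, exact
+  //     return u.d - 4503599627370496.0;  // subtract 2^52 -> (double)x
+  //   }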
+ SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, + DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, + DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, + MVT::v2f64, Load)), + DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, + DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, + MVT::v2f64, Bias))); + Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, + DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or), + DAG.getIntPtrConstant(0)); + + // Subtract the bias. + SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); + + // Handle final rounding. + EVT DestVT = Op.getValueType(); + + if (DestVT.bitsLT(MVT::f64)) { + return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub, + DAG.getIntPtrConstant(0)); + } else if (DestVT.bitsGT(MVT::f64)) { + return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub); + } + + // Handle final rounding. + return Sub; +} + +SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, + SelectionDAG &DAG) const { + SDValue N0 = Op.getOperand(0); + DebugLoc dl = Op.getDebugLoc(); + + // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't + // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform + // the optimization here. + if (DAG.SignBitIsZero(N0)) + return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0); + + EVT SrcVT = N0.getValueType(); + EVT DstVT = Op.getValueType(); + if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64) + return LowerUINT_TO_FP_i64(Op, DAG); + else if (SrcVT == MVT::i32 && X86ScalarSSEf64) + return LowerUINT_TO_FP_i32(Op, DAG); + + // Make a 64-bit buffer, and use it to build an FILD. + SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); + if (SrcVT == MVT::i32) { + SDValue WordOff = DAG.getConstant(4, getPointerTy()); + SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, + getPointerTy(), StackSlot, WordOff); + SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), + StackSlot, MachinePointerInfo(), + false, false, 0); + SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32), + OffsetSlot, MachinePointerInfo(), + false, false, 0); + SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); + return Fild; + } + + assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); + SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), + StackSlot, MachinePointerInfo(), + false, false, 0); + // For i64 source, we need to add the appropriate power of 2 if the input + // was negative. This is the same as the optimization in + // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here, + // we must be careful to do the computation in x87 extended precision, not + // in SSE. (The generic code can't know it's OK to do this, or how to.) + int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex(); + MachineMemOperand *MMO = + DAG.getMachineFunction() + .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), + MachineMemOperand::MOLoad, 8, 8); + + SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); + SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) }; + SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, 3, + MVT::i64, MMO); + + APInt FF(32, 0x5F800000ULL); + + // Check whether the sign bit is set. + SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(MVT::i64), + Op.getOperand(0), DAG.getConstant(0, MVT::i64), + ISD::SETLT); + + // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits. 
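+  // (Illustration: 0x5F800000 is the f32 bit pattern of 2^64.  FILD read the
+  //  i64 as signed, so an input with the sign bit set came out as v - 2^64;
+  //  adding the selected fudge factor, 2^64 or 0.0, corrects that.)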
+ SDValue FudgePtr = DAG.getConstantPool( + ConstantInt::get(*DAG.getContext(), FF.zext(64)), + getPointerTy()); + + // Get a pointer to FF if the sign bit was set, or to 0 otherwise. + SDValue Zero = DAG.getIntPtrConstant(0); + SDValue Four = DAG.getIntPtrConstant(4); + SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet, + Zero, Four); + FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset); + + // Load the value out, extending it from f32 to f80. + // FIXME: Avoid the extend by constructing the right constant pool? + SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), + FudgePtr, MachinePointerInfo::getConstantPool(), + MVT::f32, false, false, 4); + // Extend everything to 80 bits to force it to be done on x87. + SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); + return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0)); +} + +std::pair<SDValue,SDValue> X86TargetLowering:: +FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) const { + DebugLoc DL = Op.getDebugLoc(); + + EVT DstTy = Op.getValueType(); + + if (!IsSigned) { + assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); + DstTy = MVT::i64; + } + + assert(DstTy.getSimpleVT() <= MVT::i64 && + DstTy.getSimpleVT() >= MVT::i16 && + "Unknown FP_TO_SINT to lower!"); + + // These are really Legal. + if (DstTy == MVT::i32 && + isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) + return std::make_pair(SDValue(), SDValue()); + if (Subtarget->is64Bit() && + DstTy == MVT::i64 && + isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) + return std::make_pair(SDValue(), SDValue()); + + // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary + // stack slot. + MachineFunction &MF = DAG.getMachineFunction(); + unsigned MemSize = DstTy.getSizeInBits()/8; + int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); + SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); + + + + unsigned Opc; + switch (DstTy.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); + case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; + case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; + case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; + } + + SDValue Chain = DAG.getEntryNode(); + SDValue Value = Op.getOperand(0); + EVT TheVT = Op.getOperand(0).getValueType(); + if (isScalarFPTypeInSSEReg(TheVT)) { + assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); + Chain = DAG.getStore(Chain, DL, Value, StackSlot, + MachinePointerInfo::getFixedStack(SSFI), + false, false, 0); + SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); + SDValue Ops[] = { + Chain, StackSlot, DAG.getValueType(TheVT) + }; + + MachineMemOperand *MMO = + MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), + MachineMemOperand::MOLoad, MemSize, MemSize); + Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, 3, + DstTy, MMO); + Chain = Value.getValue(1); + SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); + StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); + } + + MachineMemOperand *MMO = + MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), + MachineMemOperand::MOStore, MemSize, MemSize); + + // Build the FP_TO_INT*_IN_MEM + SDValue Ops[] = { Chain, Value, StackSlot }; + SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), + Ops, 3, DstTy, MMO); + + return std::make_pair(FIST, StackSlot); +} + 
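+// For illustration only (helper name invented): FP_TO_INTHelper handles the
+// unsigned i32 case by widening to a signed i64 conversion.  Every uint32_t
+// value fits in an int64_t, so truncating the signed 64-bit result recovers
+// the unsigned 32-bit value:
+//
+//   uint32_t fp_to_uint32(double v) {   // assumes v is in [0, 2^32)
+//     return (uint32_t)(int64_t)v;      // same bits as the direct conversion
+//   }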
+SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, + SelectionDAG &DAG) const { + if (Op.getValueType().isVector()) + return SDValue(); + + std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true); + SDValue FIST = Vals.first, StackSlot = Vals.second; + // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. + if (FIST.getNode() == 0) return Op; + + // Load the result. + return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), + FIST, StackSlot, MachinePointerInfo(), false, false, 0); +} + +SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, + SelectionDAG &DAG) const { + std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false); + SDValue FIST = Vals.first, StackSlot = Vals.second; + assert(FIST.getNode() && "Unexpected failure"); + + // Load the result. + return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), + FIST, StackSlot, MachinePointerInfo(), false, false, 0); +} + +SDValue X86TargetLowering::LowerFABS(SDValue Op, + SelectionDAG &DAG) const { + LLVMContext *Context = DAG.getContext(); + DebugLoc dl = Op.getDebugLoc(); + EVT VT = Op.getValueType(); + EVT EltVT = VT; + if (VT.isVector()) + EltVT = VT.getVectorElementType(); + std::vector<Constant*> CV; + if (EltVT == MVT::f64) { + Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))); + CV.push_back(C); + CV.push_back(C); + } else { + Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))); + CV.push_back(C); + CV.push_back(C); + CV.push_back(C); + CV.push_back(C); + } + Constant *C = ConstantVector::get(CV); + SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); + SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(), + false, false, 16); + return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); +} + +SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { + LLVMContext *Context = DAG.getContext(); + DebugLoc dl = Op.getDebugLoc(); + EVT VT = Op.getValueType(); + EVT EltVT = VT; + if (VT.isVector()) + EltVT = VT.getVectorElementType(); + std::vector<Constant*> CV; + if (EltVT == MVT::f64) { + Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))); + CV.push_back(C); + CV.push_back(C); + } else { + Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); + CV.push_back(C); + CV.push_back(C); + CV.push_back(C); + CV.push_back(C); + } + Constant *C = ConstantVector::get(CV); + SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); + SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(), + false, false, 16); + if (VT.isVector()) { + return DAG.getNode(ISD::BITCAST, dl, VT, + DAG.getNode(ISD::XOR, dl, MVT::v2i64, + DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, + Op.getOperand(0)), + DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Mask))); + } else { + return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); + } +} + +SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { + LLVMContext *Context = DAG.getContext(); + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + DebugLoc dl = Op.getDebugLoc(); + EVT VT = Op.getValueType(); + EVT SrcVT = Op1.getValueType(); + + // If second operand is smaller, extend it first. + if (SrcVT.bitsLT(VT)) { + Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1); + SrcVT = VT; + } + // And if it is bigger, shrink it first. 
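+  // (Illustrative summary: once both operands have type VT, the rest of this
+  //  function computes copysign(Op0, Op1) bitwise as
+  //  (bits(Op0) & ~SIGN_MASK) | (bits(Op1) & SIGN_MASK), with both masks
+  //  loaded from the constant pool; SIGN_MASK is a name used only here.)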
+ if (SrcVT.bitsGT(VT)) { + Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1)); + SrcVT = VT; + } + + // At this point the operands and the result should have the same + // type, and that won't be f80 since that is not custom lowered. + + // First get the sign bit of second operand. + std::vector<Constant*> CV; + if (SrcVT == MVT::f64) { + CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)))); + CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); + } else { + CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)))); + CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); + CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); + CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); + } + Constant *C = ConstantVector::get(CV); + SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); + SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(), + false, false, 16); + SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); + + // Shift sign bit right or left if the two operands have different types. + if (SrcVT.bitsGT(VT)) { + // Op0 is MVT::f32, Op1 is MVT::f64. + SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit); + SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit, + DAG.getConstant(32, MVT::i32)); + SignBit = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, SignBit); + SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit, + DAG.getIntPtrConstant(0)); + } + + // Clear first operand sign bit. + CV.clear(); + if (VT == MVT::f64) { + CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))))); + CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); + } else { + CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))))); + CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); + CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); + CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); + } + C = ConstantVector::get(CV); + CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); + SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(), + false, false, 16); + SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2); + + // Or the value with the sign bit. + return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); +} + +SDValue X86TargetLowering::LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) const { + SDValue N0 = Op.getOperand(0); + DebugLoc dl = Op.getDebugLoc(); + EVT VT = Op.getValueType(); + + // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1). + SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0, + DAG.getConstant(1, VT)); + return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT)); +} + +/// Emit nodes that will be selected as "test Op0,Op0", or something +/// equivalent. +SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, + SelectionDAG &DAG) const { + DebugLoc dl = Op.getDebugLoc(); + + // CF and OF aren't always set the way we want. Determine which + // of these we need. 
+ bool NeedCF = false; + bool NeedOF = false; + switch (X86CC) { + default: break; + case X86::COND_A: case X86::COND_AE: + case X86::COND_B: case X86::COND_BE: + NeedCF = true; + break; + case X86::COND_G: case X86::COND_GE: + case X86::COND_L: case X86::COND_LE: + case X86::COND_O: case X86::COND_NO: + NeedOF = true; + break; + } + + // See if we can use the EFLAGS value from the operand instead of + // doing a separate TEST. TEST always sets OF and CF to 0, so unless + // we prove that the arithmetic won't overflow, we can't use OF or CF. + if (Op.getResNo() != 0 || NeedOF || NeedCF) + // Emit a CMP with 0, which is the TEST pattern. + return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, + DAG.getConstant(0, Op.getValueType())); + + unsigned Opcode = 0; + unsigned NumOperands = 0; + switch (Op.getNode()->getOpcode()) { + case ISD::ADD: + // Due to an isel shortcoming, be conservative if this add is likely to be + // selected as part of a load-modify-store instruction. When the root node + // in a match is a store, isel doesn't know how to remap non-chain non-flag + // uses of other nodes in the match, such as the ADD in this case. This + // leads to the ADD being left around and reselected, with the result being + // two adds in the output. Alas, even if none our users are stores, that + // doesn't prove we're O.K. Ergo, if we have any parents that aren't + // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require + // climbing the DAG back to the root, and it doesn't seem to be worth the + // effort. + for (SDNode::use_iterator UI = Op.getNode()->use_begin(), + UE = Op.getNode()->use_end(); UI != UE; ++UI) + if (UI->getOpcode() != ISD::CopyToReg && UI->getOpcode() != ISD::SETCC) + goto default_case; + + if (ConstantSDNode *C = + dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) { + // An add of one will be selected as an INC. + if (C->getAPIntValue() == 1) { + Opcode = X86ISD::INC; + NumOperands = 1; + break; + } + + // An add of negative one (subtract of one) will be selected as a DEC. + if (C->getAPIntValue().isAllOnesValue()) { + Opcode = X86ISD::DEC; + NumOperands = 1; + break; + } + } + + // Otherwise use a regular EFLAGS-setting add. + Opcode = X86ISD::ADD; + NumOperands = 2; + break; + case ISD::AND: { + // If the primary and result isn't used, don't bother using X86ISD::AND, + // because a TEST instruction will be better. + bool NonFlagUse = false; + for (SDNode::use_iterator UI = Op.getNode()->use_begin(), + UE = Op.getNode()->use_end(); UI != UE; ++UI) { + SDNode *User = *UI; + unsigned UOpNo = UI.getOperandNo(); + if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) { + // Look pass truncate. + UOpNo = User->use_begin().getOperandNo(); + User = *User->use_begin(); + } + + if (User->getOpcode() != ISD::BRCOND && + User->getOpcode() != ISD::SETCC && + (User->getOpcode() != ISD::SELECT || UOpNo != 0)) { + NonFlagUse = true; + break; + } + } + + if (!NonFlagUse) + break; + } + // FALL THROUGH + case ISD::SUB: + case ISD::OR: + case ISD::XOR: + // Due to the ISEL shortcoming noted above, be conservative if this op is + // likely to be selected as part of a load-modify-store instruction. + for (SDNode::use_iterator UI = Op.getNode()->use_begin(), + UE = Op.getNode()->use_end(); UI != UE; ++UI) + if (UI->getOpcode() == ISD::STORE) + goto default_case; + + // Otherwise use a regular EFLAGS-setting instruction. 
+ switch (Op.getNode()->getOpcode()) { + default: llvm_unreachable("unexpected operator!"); + case ISD::SUB: Opcode = X86ISD::SUB; break; + case ISD::OR: Opcode = X86ISD::OR; break; + case ISD::XOR: Opcode = X86ISD::XOR; break; + case ISD::AND: Opcode = X86ISD::AND; break; + } + + NumOperands = 2; + break; + case X86ISD::ADD: + case X86ISD::SUB: + case X86ISD::INC: + case X86ISD::DEC: + case X86ISD::OR: + case X86ISD::XOR: + case X86ISD::AND: + return SDValue(Op.getNode(), 1); + default: + default_case: + break; + } + + if (Opcode == 0) + // Emit a CMP with 0, which is the TEST pattern. + return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, + DAG.getConstant(0, Op.getValueType())); + + SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); + SmallVector<SDValue, 4> Ops; + for (unsigned i = 0; i != NumOperands; ++i) + Ops.push_back(Op.getOperand(i)); + + SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands); + DAG.ReplaceAllUsesWith(Op, New); + return SDValue(New.getNode(), 1); +} + +/// Emit nodes that will be selected as "cmp Op0,Op1", or something +/// equivalent. +SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, + SelectionDAG &DAG) const { + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) + if (C->getAPIntValue() == 0) + return EmitTest(Op0, X86CC, DAG); + + DebugLoc dl = Op0.getDebugLoc(); + return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); +} + +/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node +/// if it's possible. +SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, + DebugLoc dl, SelectionDAG &DAG) const { + SDValue Op0 = And.getOperand(0); + SDValue Op1 = And.getOperand(1); + if (Op0.getOpcode() == ISD::TRUNCATE) + Op0 = Op0.getOperand(0); + if (Op1.getOpcode() == ISD::TRUNCATE) + Op1 = Op1.getOperand(0); + + SDValue LHS, RHS; + if (Op1.getOpcode() == ISD::SHL) + std::swap(Op0, Op1); + if (Op0.getOpcode() == ISD::SHL) { + if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0))) + if (And00C->getZExtValue() == 1) { + // If we looked past a truncate, check that it's only truncating away + // known zeros. + unsigned BitWidth = Op0.getValueSizeInBits(); + unsigned AndBitWidth = And.getValueSizeInBits(); + if (BitWidth > AndBitWidth) { + APInt Mask = APInt::getAllOnesValue(BitWidth), Zeros, Ones; + DAG.ComputeMaskedBits(Op0, Mask, Zeros, Ones); + if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth) + return SDValue(); + } + LHS = Op1; + RHS = Op0.getOperand(1); + } + } else if (Op1.getOpcode() == ISD::Constant) { + ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1); + SDValue AndLHS = Op0; + if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) { + LHS = AndLHS.getOperand(0); + RHS = AndLHS.getOperand(1); + } + } + + if (LHS.getNode()) { + // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT + // instruction. Since the shift amount is in-range-or-undefined, we know + // that doing a bittest on the i32 value is ok. We extend to i32 because + // the encoding for the i16 version is larger than the i32 version. + // Also promote i16 to i32 for performance / code size reason. + if (LHS.getValueType() == MVT::i8 || + LHS.getValueType() == MVT::i16) + LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS); + + // If the operand types disagree, extend the shift amount to match. Since + // BT ignores high bits (like shifts) we can use anyextend. 
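// Illustrative aside (not from the imported source): what LowerToBT matches is
// "(X & (1 << N)) == 0", i.e. "bit N of X is clear".  BT copies bit N into CF,
// so the comparison becomes SETAE (CF == 0) for SETEQ and SETB (CF == 1) for
// SETNE, as selected below.  Hypothetical scalar equivalent:
//
//   static bool bit_is_clear(uint32_t x, unsigned n) {  // n assumed < 32
//     return ((x >> n) & 1) == 0;                       // bt x, n ; setae
//   }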
+ if (LHS.getValueType() != RHS.getValueType()) + RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS); + + SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS); + unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; + return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(Cond, MVT::i8), BT); + } + + return SDValue(); +} + +SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { + + if (Op.getValueType().isVector()) return LowerVSETCC(Op, DAG); + + assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer"); + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + DebugLoc dl = Op.getDebugLoc(); + ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); + + // Optimize to BT if possible. + // Lower (X & (1 << N)) == 0 to BT(X, N). + // Lower ((X >>u N) & 1) != 0 to BT(X, N). + // Lower ((X >>s N) & 1) != 0 to BT(X, N). + if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && + Op1.getOpcode() == ISD::Constant && + cast<ConstantSDNode>(Op1)->isNullValue() && + (CC == ISD::SETEQ || CC == ISD::SETNE)) { + SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG); + if (NewSetCC.getNode()) + return NewSetCC; + } + + // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of + // these. + if (Op1.getOpcode() == ISD::Constant && + (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 || + cast<ConstantSDNode>(Op1)->isNullValue()) && + (CC == ISD::SETEQ || CC == ISD::SETNE)) { + + // If the input is a setcc, then reuse the input setcc or use a new one with + // the inverted condition. + if (Op0.getOpcode() == X86ISD::SETCC) { + X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); + bool Invert = (CC == ISD::SETNE) ^ + cast<ConstantSDNode>(Op1)->isNullValue(); + if (!Invert) return Op0; + + CCode = X86::GetOppositeBranchCondition(CCode); + return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1)); + } + } + + bool isFP = Op1.getValueType().isFloatingPoint(); + unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); + if (X86CC == X86::COND_INVALID) + return SDValue(); + + SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG); + return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(X86CC, MVT::i8), EFLAGS); +} + +// Lower256IntVSETCC - Break a VSETCC 256-bit integer VSETCC into two new 128 +// ones, and then concatenate the result back. 
+static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getValueType(); + + assert(VT.getSizeInBits() == 256 && Op.getOpcode() == ISD::SETCC && + "Unsupported value type for operation"); + + int NumElems = VT.getVectorNumElements(); + DebugLoc dl = Op.getDebugLoc(); + SDValue CC = Op.getOperand(2); + SDValue Idx0 = DAG.getConstant(0, MVT::i32); + SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32); + + // Extract the LHS vectors + SDValue LHS = Op.getOperand(0); + SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl); + SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl); + + // Extract the RHS vectors + SDValue RHS = Op.getOperand(1); + SDValue RHS1 = Extract128BitVector(RHS, Idx0, DAG, dl); + SDValue RHS2 = Extract128BitVector(RHS, Idx1, DAG, dl); + + // Issue the operation on the smaller types and concatenate the result back + MVT EltVT = VT.getVectorElementType().getSimpleVT(); + EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, + DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC), + DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC)); +} + + +SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { + SDValue Cond; + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + SDValue CC = Op.getOperand(2); + EVT VT = Op.getValueType(); + ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); + bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); + DebugLoc dl = Op.getDebugLoc(); + + if (isFP) { + unsigned SSECC = 8; + EVT EltVT = Op0.getValueType().getVectorElementType(); + assert(EltVT == MVT::f32 || EltVT == MVT::f64); + + unsigned Opc = EltVT == MVT::f32 ? X86ISD::CMPPS : X86ISD::CMPPD; + bool Swap = false; + + // SSE Condition code mapping: + // 0 - EQ + // 1 - LT + // 2 - LE + // 3 - UNORD + // 4 - NEQ + // 5 - NLT + // 6 - NLE + // 7 - ORD + switch (SetCCOpcode) { + default: break; + case ISD::SETOEQ: + case ISD::SETEQ: SSECC = 0; break; + case ISD::SETOGT: + case ISD::SETGT: Swap = true; // Fallthrough + case ISD::SETLT: + case ISD::SETOLT: SSECC = 1; break; + case ISD::SETOGE: + case ISD::SETGE: Swap = true; // Fallthrough + case ISD::SETLE: + case ISD::SETOLE: SSECC = 2; break; + case ISD::SETUO: SSECC = 3; break; + case ISD::SETUNE: + case ISD::SETNE: SSECC = 4; break; + case ISD::SETULE: Swap = true; + case ISD::SETUGE: SSECC = 5; break; + case ISD::SETULT: Swap = true; + case ISD::SETUGT: SSECC = 6; break; + case ISD::SETO: SSECC = 7; break; + } + if (Swap) + std::swap(Op0, Op1); + + // In the two special cases we can't handle, emit two comparisons. + if (SSECC == 8) { + if (SetCCOpcode == ISD::SETUEQ) { + SDValue UNORD, EQ; + UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8)); + EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8)); + return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ); + } + else if (SetCCOpcode == ISD::SETONE) { + SDValue ORD, NEQ; + ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8)); + NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8)); + return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ); + } + llvm_unreachable("Illegal FP comparison"); + } + // Handle all other FP comparisons here. + return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8)); + } + + // Break 256-bit integer vector compare into smaller ones. 
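// Illustrative aside (not from the imported source): the two FP predicates
// with no single CMPPS immediate are built above from two comparisons,
// SETUEQ = UNORD(3) | EQ(0) and SETONE = ORD(7) & NEQ(4).  The same
// decomposition expressed with the standard SSE intrinsics, hypothetical
// helper names:
//
//   #include <xmmintrin.h>
//   static __m128 cmp_ueq(__m128 a, __m128 b) {          // NaN-or-equal
//     return _mm_or_ps(_mm_cmpunord_ps(a, b), _mm_cmpeq_ps(a, b));
//   }
//   static __m128 cmp_one(__m128 a, __m128 b) {          // ordered-and-unequal
//     return _mm_and_ps(_mm_cmpord_ps(a, b), _mm_cmpneq_ps(a, b));
//   }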
+ if (!isFP && VT.getSizeInBits() == 256) + return Lower256IntVSETCC(Op, DAG); + + // We are handling one of the integer comparisons here. Since SSE only has + // GT and EQ comparisons for integer, swapping operands and multiple + // operations may be required for some comparisons. + unsigned Opc = 0, EQOpc = 0, GTOpc = 0; + bool Swap = false, Invert = false, FlipSigns = false; + + switch (VT.getSimpleVT().SimpleTy) { + default: break; + case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break; + case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break; + case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break; + case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break; + } + + switch (SetCCOpcode) { + default: break; + case ISD::SETNE: Invert = true; + case ISD::SETEQ: Opc = EQOpc; break; + case ISD::SETLT: Swap = true; + case ISD::SETGT: Opc = GTOpc; break; + case ISD::SETGE: Swap = true; + case ISD::SETLE: Opc = GTOpc; Invert = true; break; + case ISD::SETULT: Swap = true; + case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break; + case ISD::SETUGE: Swap = true; + case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break; + } + if (Swap) + std::swap(Op0, Op1); + + // Check that the operation in question is available (most are plain SSE2, + // but PCMPGTQ and PCMPEQQ have different requirements). + if (Opc == X86ISD::PCMPGTQ && !Subtarget->hasSSE42() && !Subtarget->hasAVX()) + return SDValue(); + if (Opc == X86ISD::PCMPEQQ && !Subtarget->hasSSE41() && !Subtarget->hasAVX()) + return SDValue(); + + // Since SSE has no unsigned integer comparisons, we need to flip the sign + // bits of the inputs before performing those operations. + if (FlipSigns) { + EVT EltVT = VT.getVectorElementType(); + SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), + EltVT); + std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit); + SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0], + SignBits.size()); + Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec); + Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec); + } + + SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); + + // If the logical-not of the result is required, perform that now. + if (Invert) + Result = DAG.getNOT(dl, Result, VT); + + return Result; +} + +// isX86LogicalCmp - Return true if opcode is a X86 logical comparison. 
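// Illustrative aside (not from the imported source): the FlipSigns path above
// uses the identity that an unsigned comparison equals a signed comparison
// after XORing both operands with the sign bit, which is how the signed-only
// PCMPGT instructions can implement SETUGT and friends.  Scalar form of the
// identity, hypothetical helper name:
//
//   static bool ult_via_signed(uint32_t a, uint32_t b) {
//     return (int32_t)(a ^ 0x80000000u) < (int32_t)(b ^ 0x80000000u);
//     // same result as (a < b) on the original unsigned values
//   }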
+static bool isX86LogicalCmp(SDValue Op) { + unsigned Opc = Op.getNode()->getOpcode(); + if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI) + return true; + if (Op.getResNo() == 1 && + (Opc == X86ISD::ADD || + Opc == X86ISD::SUB || + Opc == X86ISD::ADC || + Opc == X86ISD::SBB || + Opc == X86ISD::SMUL || + Opc == X86ISD::UMUL || + Opc == X86ISD::INC || + Opc == X86ISD::DEC || + Opc == X86ISD::OR || + Opc == X86ISD::XOR || + Opc == X86ISD::AND)) + return true; + + if (Op.getResNo() == 2 && Opc == X86ISD::UMUL) + return true; + + return false; +} + +static bool isZero(SDValue V) { + ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); + return C && C->isNullValue(); +} + +static bool isAllOnes(SDValue V) { + ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); + return C && C->isAllOnesValue(); +} + +SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { + bool addTest = true; + SDValue Cond = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + SDValue Op2 = Op.getOperand(2); + DebugLoc DL = Op.getDebugLoc(); + SDValue CC; + + if (Cond.getOpcode() == ISD::SETCC) { + SDValue NewCond = LowerSETCC(Cond, DAG); + if (NewCond.getNode()) + Cond = NewCond; + } + + // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y + // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y + // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y + // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y + if (Cond.getOpcode() == X86ISD::SETCC && + Cond.getOperand(1).getOpcode() == X86ISD::CMP && + isZero(Cond.getOperand(1).getOperand(1))) { + SDValue Cmp = Cond.getOperand(1); + + unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue(); + + if ((isAllOnes(Op1) || isAllOnes(Op2)) && + (CondCode == X86::COND_E || CondCode == X86::COND_NE)) { + SDValue Y = isAllOnes(Op2) ? Op1 : Op2; + + SDValue CmpOp0 = Cmp.getOperand(0); + Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, + CmpOp0, DAG.getConstant(1, CmpOp0.getValueType())); + + SDValue Res = // Res = 0 or -1. + DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), + DAG.getConstant(X86::COND_B, MVT::i8), Cmp); + + if (isAllOnes(Op1) != (CondCode == X86::COND_E)) + Res = DAG.getNOT(DL, Res, Res.getValueType()); + + ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2); + if (N2C == 0 || !N2C->isNullValue()) + Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y); + return Res; + } + } + + // Look past (and (setcc_carry (cmp ...)), 1). + if (Cond.getOpcode() == ISD::AND && + Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); + if (C && C->getAPIntValue() == 1) + Cond = Cond.getOperand(0); + } + + // If condition flag is set by a X86ISD::CMP, then use it as the condition + // setting operand in place of the X86ISD::SETCC. + if (Cond.getOpcode() == X86ISD::SETCC || + Cond.getOpcode() == X86ISD::SETCC_CARRY) { + CC = Cond.getOperand(0); + + SDValue Cmp = Cond.getOperand(1); + unsigned Opc = Cmp.getOpcode(); + EVT VT = Op.getValueType(); + + bool IllegalFPCMov = false; + if (VT.isFloatingPoint() && !VT.isVector() && + !isScalarFPTypeInSSEReg(VT)) // FPStack? + IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue()); + + if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || + Opc == X86ISD::BT) { // FIXME + Cond = Cmp; + addTest = false; + } + } + + if (addTest) { + // Look pass the truncate. + if (Cond.getOpcode() == ISD::TRUNCATE) + Cond = Cond.getOperand(0); + + // We know the result of AND is compared against zero. 
Try to match + // it to BT. + if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { + SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG); + if (NewSetCC.getNode()) { + CC = NewSetCC.getOperand(0); + Cond = NewSetCC.getOperand(1); + addTest = false; + } + } + } + + if (addTest) { + CC = DAG.getConstant(X86::COND_NE, MVT::i8); + Cond = EmitTest(Cond, X86::COND_NE, DAG); + } + + // a < b ? -1 : 0 -> RES = ~setcc_carry + // a < b ? 0 : -1 -> RES = setcc_carry + // a >= b ? -1 : 0 -> RES = setcc_carry + // a >= b ? 0 : -1 -> RES = ~setcc_carry + if (Cond.getOpcode() == X86ISD::CMP) { + unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue(); + + if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) && + (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) { + SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), + DAG.getConstant(X86::COND_B, MVT::i8), Cond); + if (isAllOnes(Op1) != (CondCode == X86::COND_B)) + return DAG.getNOT(DL, Res, Res.getValueType()); + return Res; + } + } + + // X86ISD::CMOV means set the result (which is operand 1) to the RHS if + // condition is true. + SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue); + SDValue Ops[] = { Op2, Op1, CC, Cond }; + return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops)); +} + +// isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or +// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart +// from the AND / OR. +static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { + Opc = Op.getOpcode(); + if (Opc != ISD::OR && Opc != ISD::AND) + return false; + return (Op.getOperand(0).getOpcode() == X86ISD::SETCC && + Op.getOperand(0).hasOneUse() && + Op.getOperand(1).getOpcode() == X86ISD::SETCC && + Op.getOperand(1).hasOneUse()); +} + +// isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and +// 1 and that the SETCC node has a single use. +static bool isXor1OfSetCC(SDValue Op) { + if (Op.getOpcode() != ISD::XOR) + return false; + ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); + if (N1C && N1C->getAPIntValue() == 1) { + return Op.getOperand(0).getOpcode() == X86ISD::SETCC && + Op.getOperand(0).hasOneUse(); + } + return false; +} + +SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { + bool addTest = true; + SDValue Chain = Op.getOperand(0); + SDValue Cond = Op.getOperand(1); + SDValue Dest = Op.getOperand(2); + DebugLoc dl = Op.getDebugLoc(); + SDValue CC; + + if (Cond.getOpcode() == ISD::SETCC) { + SDValue NewCond = LowerSETCC(Cond, DAG); + if (NewCond.getNode()) + Cond = NewCond; + } +#if 0 + // FIXME: LowerXALUO doesn't handle these!! + else if (Cond.getOpcode() == X86ISD::ADD || + Cond.getOpcode() == X86ISD::SUB || + Cond.getOpcode() == X86ISD::SMUL || + Cond.getOpcode() == X86ISD::UMUL) + Cond = LowerXALUO(Cond, DAG); +#endif + + // Look pass (and (setcc_carry (cmp ...)), 1). + if (Cond.getOpcode() == ISD::AND && + Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); + if (C && C->getAPIntValue() == 1) + Cond = Cond.getOperand(0); + } + + // If condition flag is set by a X86ISD::CMP, then use it as the condition + // setting operand in place of the X86ISD::SETCC. + if (Cond.getOpcode() == X86ISD::SETCC || + Cond.getOpcode() == X86ISD::SETCC_CARRY) { + CC = Cond.getOperand(0); + + SDValue Cmp = Cond.getOperand(1); + unsigned Opc = Cmp.getOpcode(); + // FIXME: WHY THE SPECIAL CASING OF LogicalCmp?? 
+ if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) { + Cond = Cmp; + addTest = false; + } else { + switch (cast<ConstantSDNode>(CC)->getZExtValue()) { + default: break; + case X86::COND_O: + case X86::COND_B: + // These can only come from an arithmetic instruction with overflow, + // e.g. SADDO, UADDO. + Cond = Cond.getNode()->getOperand(1); + addTest = false; + break; + } + } + } else { + unsigned CondOpc; + if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { + SDValue Cmp = Cond.getOperand(0).getOperand(1); + if (CondOpc == ISD::OR) { + // Also, recognize the pattern generated by an FCMP_UNE. We can emit + // two branches instead of an explicit OR instruction with a + // separate test. + if (Cmp == Cond.getOperand(1).getOperand(1) && + isX86LogicalCmp(Cmp)) { + CC = Cond.getOperand(0).getOperand(0); + Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), + Chain, Dest, CC, Cmp); + CC = Cond.getOperand(1).getOperand(0); + Cond = Cmp; + addTest = false; + } + } else { // ISD::AND + // Also, recognize the pattern generated by an FCMP_OEQ. We can emit + // two branches instead of an explicit AND instruction with a + // separate test. However, we only do this if this block doesn't + // have a fall-through edge, because this requires an explicit + // jmp when the condition is false. + if (Cmp == Cond.getOperand(1).getOperand(1) && + isX86LogicalCmp(Cmp) && + Op.getNode()->hasOneUse()) { + X86::CondCode CCode = + (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); + CCode = X86::GetOppositeBranchCondition(CCode); + CC = DAG.getConstant(CCode, MVT::i8); + SDNode *User = *Op.getNode()->use_begin(); + // Look for an unconditional branch following this conditional branch. + // We need this because we need to reverse the successors in order + // to implement FCMP_OEQ. + if (User->getOpcode() == ISD::BR) { + SDValue FalseBB = User->getOperand(1); + SDNode *NewBR = + DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); + assert(NewBR == User); + (void)NewBR; + Dest = FalseBB; + + Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), + Chain, Dest, CC, Cmp); + X86::CondCode CCode = + (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); + CCode = X86::GetOppositeBranchCondition(CCode); + CC = DAG.getConstant(CCode, MVT::i8); + Cond = Cmp; + addTest = false; + } + } + } + } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) { + // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition. + // It should be transformed during dag combiner except when the condition + // is set by a arithmetics with overflow node. + X86::CondCode CCode = + (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); + CCode = X86::GetOppositeBranchCondition(CCode); + CC = DAG.getConstant(CCode, MVT::i8); + Cond = Cond.getOperand(0).getOperand(1); + addTest = false; + } + } + + if (addTest) { + // Look pass the truncate. + if (Cond.getOpcode() == ISD::TRUNCATE) + Cond = Cond.getOperand(0); + + // We know the result of AND is compared against zero. Try to match + // it to BT. 
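// Side note, illustration only (not from the imported source): the reason
// FCMP_UNE / FCMP_OEQ are split into two branches above is that UCOMISS and
// UCOMISD report "unordered" in PF, so no single Jcc covers these predicates:
//   une(a,b) == (a != b) || unordered(a,b)  ->  jne target ; jp target
//   oeq(a,b) == (a == b) && ordered(a,b)    ->  jne skip ; jp skip ; jmp target
// which is also why the AND case has to reverse the successors when there is
// no fall-through edge.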
+ if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { + SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG); + if (NewSetCC.getNode()) { + CC = NewSetCC.getOperand(0); + Cond = NewSetCC.getOperand(1); + addTest = false; + } + } + } + + if (addTest) { + CC = DAG.getConstant(X86::COND_NE, MVT::i8); + Cond = EmitTest(Cond, X86::COND_NE, DAG); + } + return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), + Chain, Dest, CC, Cond); +} + + +// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets. +// Calls to _alloca is needed to probe the stack when allocating more than 4k +// bytes in one go. Touching the stack at 4K increments is necessary to ensure +// that the guard pages used by the OS virtual memory manager are allocated in +// correct sequence. +SDValue +X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, + SelectionDAG &DAG) const { + assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows() || + EnableSegmentedStacks) && + "This should be used only on Windows targets or when segmented stacks " + "are being used"); + assert(!Subtarget->isTargetEnvMacho() && "Not implemented"); + DebugLoc dl = Op.getDebugLoc(); + + // Get the inputs. + SDValue Chain = Op.getOperand(0); + SDValue Size = Op.getOperand(1); + // FIXME: Ensure alignment here + + bool Is64Bit = Subtarget->is64Bit(); + EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32; + + if (EnableSegmentedStacks) { + MachineFunction &MF = DAG.getMachineFunction(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + if (Is64Bit) { + // The 64 bit implementation of segmented stacks needs to clobber both r10 + // r11. This makes it impossible to use it along with nested parameters. + const Function *F = MF.getFunction(); + + for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); + I != E; I++) + if (I->hasNestAttr()) + report_fatal_error("Cannot use segmented stacks with functions that " + "have nested arguments."); + } + + const TargetRegisterClass *AddrRegClass = + getRegClassFor(Subtarget->is64Bit() ? MVT::i64:MVT::i32); + unsigned Vreg = MRI.createVirtualRegister(AddrRegClass); + Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size); + SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, + DAG.getRegister(Vreg, SPTy)); + SDValue Ops1[2] = { Value, Chain }; + return DAG.getMergeValues(Ops1, 2, dl); + } else { + SDValue Flag; + unsigned Reg = (Subtarget->is64Bit() ? X86::RAX : X86::EAX); + + Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag); + Flag = Chain.getValue(1); + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); + + Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag); + Flag = Chain.getValue(1); + + Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1); + + SDValue Ops1[2] = { Chain.getValue(0), Chain }; + return DAG.getMergeValues(Ops1, 2, dl); + } +} + +SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); + + const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); + DebugLoc DL = Op.getDebugLoc(); + + if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) { + // vastart just stores the address of the VarArgsFrameIndex slot into the + // memory location argument. 
+ SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), + getPointerTy()); + return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), + MachinePointerInfo(SV), false, false, 0); + } + + // __va_list_tag: + // gp_offset (0 - 6 * 8) + // fp_offset (48 - 48 + 8 * 16) + // overflow_arg_area (point to parameters coming in memory). + // reg_save_area + SmallVector<SDValue, 8> MemOps; + SDValue FIN = Op.getOperand(1); + // Store gp_offset + SDValue Store = DAG.getStore(Op.getOperand(0), DL, + DAG.getConstant(FuncInfo->getVarArgsGPOffset(), + MVT::i32), + FIN, MachinePointerInfo(SV), false, false, 0); + MemOps.push_back(Store); + + // Store fp_offset + FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), + FIN, DAG.getIntPtrConstant(4)); + Store = DAG.getStore(Op.getOperand(0), DL, + DAG.getConstant(FuncInfo->getVarArgsFPOffset(), + MVT::i32), + FIN, MachinePointerInfo(SV, 4), false, false, 0); + MemOps.push_back(Store); + + // Store ptr to overflow_arg_area + FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), + FIN, DAG.getIntPtrConstant(4)); + SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), + getPointerTy()); + Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, + MachinePointerInfo(SV, 8), + false, false, 0); + MemOps.push_back(Store); + + // Store ptr to reg_save_area. + FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), + FIN, DAG.getIntPtrConstant(8)); + SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), + getPointerTy()); + Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, + MachinePointerInfo(SV, 16), false, false, 0); + MemOps.push_back(Store); + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, + &MemOps[0], MemOps.size()); +} + +SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { + assert(Subtarget->is64Bit() && + "LowerVAARG only handles 64-bit va_arg!"); + assert((Subtarget->isTargetLinux() || + Subtarget->isTargetDarwin()) && + "Unhandled target in LowerVAARG"); + assert(Op.getNode()->getNumOperands() == 4); + SDValue Chain = Op.getOperand(0); + SDValue SrcPtr = Op.getOperand(1); + const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); + unsigned Align = Op.getConstantOperandVal(3); + DebugLoc dl = Op.getDebugLoc(); + + EVT ArgVT = Op.getNode()->getValueType(0); + Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); + uint32_t ArgSize = getTargetData()->getTypeAllocSize(ArgTy); + uint8_t ArgMode; + + // Decide which area this value should be read from. + // TODO: Implement the AMD64 ABI in its entirety. This simple + // selection mechanism works only for the basic types. + if (ArgVT == MVT::f80) { + llvm_unreachable("va_arg for f80 not yet implemented"); + } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) { + ArgMode = 2; // Argument passed in XMM register. Use fp_offset. + } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) { + ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset. + } else { + llvm_unreachable("Unhandled argument type in LowerVAARG"); + } + + if (ArgMode == 2) { + // Sanity Check: Make sure using fp_offset makes sense. 
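// Illustrative aside: the gp_offset / fp_offset / overflow_arg_area /
// reg_save_area values stored by LowerVASTART and consumed here correspond to
// the System V AMD64 __va_list_tag layout (field offsets 0, 4, 8 and 16,
// matching the stores above).  Sketch of that struct; the name below is a
// stand-in for the ABI's __va_list_tag:
//
//   typedef struct {
//     unsigned int gp_offset;    // 0..48:   next GPR slot in reg_save_area
//     unsigned int fp_offset;    // 48..176: next XMM slot in reg_save_area
//     void *overflow_arg_area;   // next stack-passed argument
//     void *reg_save_area;       // where the register arguments were spilled
//   } va_list_tag;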
+ assert(!UseSoftFloat && + !(DAG.getMachineFunction() + .getFunction()->hasFnAttr(Attribute::NoImplicitFloat)) && + Subtarget->hasXMM()); + } + + // Insert VAARG_64 node into the DAG + // VAARG_64 returns two values: Variable Argument Address, Chain + SmallVector<SDValue, 11> InstOps; + InstOps.push_back(Chain); + InstOps.push_back(SrcPtr); + InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32)); + InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8)); + InstOps.push_back(DAG.getConstant(Align, MVT::i32)); + SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other); + SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl, + VTs, &InstOps[0], InstOps.size(), + MVT::i64, + MachinePointerInfo(SV), + /*Align=*/0, + /*Volatile=*/false, + /*ReadMem=*/true, + /*WriteMem=*/true); + Chain = VAARG.getValue(1); + + // Load the next argument and return it + return DAG.getLoad(ArgVT, dl, + Chain, + VAARG, + MachinePointerInfo(), + false, false, 0); +} + +SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { + // X86-64 va_list is a struct { i32, i32, i8*, i8* }. + assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); + SDValue Chain = Op.getOperand(0); + SDValue DstPtr = Op.getOperand(1); + SDValue SrcPtr = Op.getOperand(2); + const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); + const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); + DebugLoc DL = Op.getDebugLoc(); + + return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, + DAG.getIntPtrConstant(24), 8, /*isVolatile*/false, + false, + MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); +} + +SDValue +X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { + DebugLoc dl = Op.getDebugLoc(); + unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + switch (IntNo) { + default: return SDValue(); // Don't custom lower most intrinsics. + // Comparison intrinsics. 
+ case Intrinsic::x86_sse_comieq_ss: + case Intrinsic::x86_sse_comilt_ss: + case Intrinsic::x86_sse_comile_ss: + case Intrinsic::x86_sse_comigt_ss: + case Intrinsic::x86_sse_comige_ss: + case Intrinsic::x86_sse_comineq_ss: + case Intrinsic::x86_sse_ucomieq_ss: + case Intrinsic::x86_sse_ucomilt_ss: + case Intrinsic::x86_sse_ucomile_ss: + case Intrinsic::x86_sse_ucomigt_ss: + case Intrinsic::x86_sse_ucomige_ss: + case Intrinsic::x86_sse_ucomineq_ss: + case Intrinsic::x86_sse2_comieq_sd: + case Intrinsic::x86_sse2_comilt_sd: + case Intrinsic::x86_sse2_comile_sd: + case Intrinsic::x86_sse2_comigt_sd: + case Intrinsic::x86_sse2_comige_sd: + case Intrinsic::x86_sse2_comineq_sd: + case Intrinsic::x86_sse2_ucomieq_sd: + case Intrinsic::x86_sse2_ucomilt_sd: + case Intrinsic::x86_sse2_ucomile_sd: + case Intrinsic::x86_sse2_ucomigt_sd: + case Intrinsic::x86_sse2_ucomige_sd: + case Intrinsic::x86_sse2_ucomineq_sd: { + unsigned Opc = 0; + ISD::CondCode CC = ISD::SETCC_INVALID; + switch (IntNo) { + default: break; + case Intrinsic::x86_sse_comieq_ss: + case Intrinsic::x86_sse2_comieq_sd: + Opc = X86ISD::COMI; + CC = ISD::SETEQ; + break; + case Intrinsic::x86_sse_comilt_ss: + case Intrinsic::x86_sse2_comilt_sd: + Opc = X86ISD::COMI; + CC = ISD::SETLT; + break; + case Intrinsic::x86_sse_comile_ss: + case Intrinsic::x86_sse2_comile_sd: + Opc = X86ISD::COMI; + CC = ISD::SETLE; + break; + case Intrinsic::x86_sse_comigt_ss: + case Intrinsic::x86_sse2_comigt_sd: + Opc = X86ISD::COMI; + CC = ISD::SETGT; + break; + case Intrinsic::x86_sse_comige_ss: + case Intrinsic::x86_sse2_comige_sd: + Opc = X86ISD::COMI; + CC = ISD::SETGE; + break; + case Intrinsic::x86_sse_comineq_ss: + case Intrinsic::x86_sse2_comineq_sd: + Opc = X86ISD::COMI; + CC = ISD::SETNE; + break; + case Intrinsic::x86_sse_ucomieq_ss: + case Intrinsic::x86_sse2_ucomieq_sd: + Opc = X86ISD::UCOMI; + CC = ISD::SETEQ; + break; + case Intrinsic::x86_sse_ucomilt_ss: + case Intrinsic::x86_sse2_ucomilt_sd: + Opc = X86ISD::UCOMI; + CC = ISD::SETLT; + break; + case Intrinsic::x86_sse_ucomile_ss: + case Intrinsic::x86_sse2_ucomile_sd: + Opc = X86ISD::UCOMI; + CC = ISD::SETLE; + break; + case Intrinsic::x86_sse_ucomigt_ss: + case Intrinsic::x86_sse2_ucomigt_sd: + Opc = X86ISD::UCOMI; + CC = ISD::SETGT; + break; + case Intrinsic::x86_sse_ucomige_ss: + case Intrinsic::x86_sse2_ucomige_sd: + Opc = X86ISD::UCOMI; + CC = ISD::SETGE; + break; + case Intrinsic::x86_sse_ucomineq_ss: + case Intrinsic::x86_sse2_ucomineq_sd: + Opc = X86ISD::UCOMI; + CC = ISD::SETNE; + break; + } + + SDValue LHS = Op.getOperand(1); + SDValue RHS = Op.getOperand(2); + unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG); + assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!"); + SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(X86CC, MVT::i8), Cond); + return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); + } + // Arithmetic intrinsics. + case Intrinsic::x86_sse3_hadd_ps: + case Intrinsic::x86_sse3_hadd_pd: + case Intrinsic::x86_avx_hadd_ps_256: + case Intrinsic::x86_avx_hadd_pd_256: + return DAG.getNode(X86ISD::FHADD, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_sse3_hsub_ps: + case Intrinsic::x86_sse3_hsub_pd: + case Intrinsic::x86_avx_hsub_ps_256: + case Intrinsic::x86_avx_hsub_pd_256: + return DAG.getNode(X86ISD::FHSUB, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + // ptest and testp intrinsics. 
The intrinsic these come from are designed to + // return an integer value, not just an instruction so lower it to the ptest + // or testp pattern and a setcc for the result. + case Intrinsic::x86_sse41_ptestz: + case Intrinsic::x86_sse41_ptestc: + case Intrinsic::x86_sse41_ptestnzc: + case Intrinsic::x86_avx_ptestz_256: + case Intrinsic::x86_avx_ptestc_256: + case Intrinsic::x86_avx_ptestnzc_256: + case Intrinsic::x86_avx_vtestz_ps: + case Intrinsic::x86_avx_vtestc_ps: + case Intrinsic::x86_avx_vtestnzc_ps: + case Intrinsic::x86_avx_vtestz_pd: + case Intrinsic::x86_avx_vtestc_pd: + case Intrinsic::x86_avx_vtestnzc_pd: + case Intrinsic::x86_avx_vtestz_ps_256: + case Intrinsic::x86_avx_vtestc_ps_256: + case Intrinsic::x86_avx_vtestnzc_ps_256: + case Intrinsic::x86_avx_vtestz_pd_256: + case Intrinsic::x86_avx_vtestc_pd_256: + case Intrinsic::x86_avx_vtestnzc_pd_256: { + bool IsTestPacked = false; + unsigned X86CC = 0; + switch (IntNo) { + default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); + case Intrinsic::x86_avx_vtestz_ps: + case Intrinsic::x86_avx_vtestz_pd: + case Intrinsic::x86_avx_vtestz_ps_256: + case Intrinsic::x86_avx_vtestz_pd_256: + IsTestPacked = true; // Fallthrough + case Intrinsic::x86_sse41_ptestz: + case Intrinsic::x86_avx_ptestz_256: + // ZF = 1 + X86CC = X86::COND_E; + break; + case Intrinsic::x86_avx_vtestc_ps: + case Intrinsic::x86_avx_vtestc_pd: + case Intrinsic::x86_avx_vtestc_ps_256: + case Intrinsic::x86_avx_vtestc_pd_256: + IsTestPacked = true; // Fallthrough + case Intrinsic::x86_sse41_ptestc: + case Intrinsic::x86_avx_ptestc_256: + // CF = 1 + X86CC = X86::COND_B; + break; + case Intrinsic::x86_avx_vtestnzc_ps: + case Intrinsic::x86_avx_vtestnzc_pd: + case Intrinsic::x86_avx_vtestnzc_ps_256: + case Intrinsic::x86_avx_vtestnzc_pd_256: + IsTestPacked = true; // Fallthrough + case Intrinsic::x86_sse41_ptestnzc: + case Intrinsic::x86_avx_ptestnzc_256: + // ZF and CF = 0 + X86CC = X86::COND_A; + break; + } + + SDValue LHS = Op.getOperand(1); + SDValue RHS = Op.getOperand(2); + unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST; + SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS); + SDValue CC = DAG.getConstant(X86CC, MVT::i8); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); + return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); + } + + // Fix vector shift instructions where the last operand is a non-immediate + // i32 value. 
+ case Intrinsic::x86_sse2_pslli_w: + case Intrinsic::x86_sse2_pslli_d: + case Intrinsic::x86_sse2_pslli_q: + case Intrinsic::x86_sse2_psrli_w: + case Intrinsic::x86_sse2_psrli_d: + case Intrinsic::x86_sse2_psrli_q: + case Intrinsic::x86_sse2_psrai_w: + case Intrinsic::x86_sse2_psrai_d: + case Intrinsic::x86_mmx_pslli_w: + case Intrinsic::x86_mmx_pslli_d: + case Intrinsic::x86_mmx_pslli_q: + case Intrinsic::x86_mmx_psrli_w: + case Intrinsic::x86_mmx_psrli_d: + case Intrinsic::x86_mmx_psrli_q: + case Intrinsic::x86_mmx_psrai_w: + case Intrinsic::x86_mmx_psrai_d: { + SDValue ShAmt = Op.getOperand(2); + if (isa<ConstantSDNode>(ShAmt)) + return SDValue(); + + unsigned NewIntNo = 0; + EVT ShAmtVT = MVT::v4i32; + switch (IntNo) { + case Intrinsic::x86_sse2_pslli_w: + NewIntNo = Intrinsic::x86_sse2_psll_w; + break; + case Intrinsic::x86_sse2_pslli_d: + NewIntNo = Intrinsic::x86_sse2_psll_d; + break; + case Intrinsic::x86_sse2_pslli_q: + NewIntNo = Intrinsic::x86_sse2_psll_q; + break; + case Intrinsic::x86_sse2_psrli_w: + NewIntNo = Intrinsic::x86_sse2_psrl_w; + break; + case Intrinsic::x86_sse2_psrli_d: + NewIntNo = Intrinsic::x86_sse2_psrl_d; + break; + case Intrinsic::x86_sse2_psrli_q: + NewIntNo = Intrinsic::x86_sse2_psrl_q; + break; + case Intrinsic::x86_sse2_psrai_w: + NewIntNo = Intrinsic::x86_sse2_psra_w; + break; + case Intrinsic::x86_sse2_psrai_d: + NewIntNo = Intrinsic::x86_sse2_psra_d; + break; + default: { + ShAmtVT = MVT::v2i32; + switch (IntNo) { + case Intrinsic::x86_mmx_pslli_w: + NewIntNo = Intrinsic::x86_mmx_psll_w; + break; + case Intrinsic::x86_mmx_pslli_d: + NewIntNo = Intrinsic::x86_mmx_psll_d; + break; + case Intrinsic::x86_mmx_pslli_q: + NewIntNo = Intrinsic::x86_mmx_psll_q; + break; + case Intrinsic::x86_mmx_psrli_w: + NewIntNo = Intrinsic::x86_mmx_psrl_w; + break; + case Intrinsic::x86_mmx_psrli_d: + NewIntNo = Intrinsic::x86_mmx_psrl_d; + break; + case Intrinsic::x86_mmx_psrli_q: + NewIntNo = Intrinsic::x86_mmx_psrl_q; + break; + case Intrinsic::x86_mmx_psrai_w: + NewIntNo = Intrinsic::x86_mmx_psra_w; + break; + case Intrinsic::x86_mmx_psrai_d: + NewIntNo = Intrinsic::x86_mmx_psra_d; + break; + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. + } + break; + } + } + + // The vector shift intrinsics with scalars uses 32b shift amounts but + // the sse2/mmx shift instructions reads 64 bits. Set the upper 32 bits + // to be zero. + SDValue ShOps[4]; + ShOps[0] = ShAmt; + ShOps[1] = DAG.getConstant(0, MVT::i32); + if (ShAmtVT == MVT::v4i32) { + ShOps[2] = DAG.getUNDEF(MVT::i32); + ShOps[3] = DAG.getUNDEF(MVT::i32); + ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 4); + } else { + ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2); +// FIXME this must be lowered to get rid of the invalid type. + } + + EVT VT = Op.getValueType(); + ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(NewIntNo, MVT::i32), + Op.getOperand(1), ShAmt); + } + } +} + +SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, + SelectionDAG &DAG) const { + MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + MFI->setReturnAddressIsTaken(true); + + unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + DebugLoc dl = Op.getDebugLoc(); + + if (Depth > 0) { + SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); + SDValue Offset = + DAG.getConstant(TD->getPointerSize(), + Subtarget->is64Bit() ? 
MVT::i64 : MVT::i32); + return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), + DAG.getNode(ISD::ADD, dl, getPointerTy(), + FrameAddr, Offset), + MachinePointerInfo(), false, false, 0); + } + + // Just load the return address. + SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); + return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), + RetAddrFI, MachinePointerInfo(), false, false, 0); +} + +SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { + MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + MFI->setFrameAddressIsTaken(true); + + EVT VT = Op.getValueType(); + DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful + unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP; + SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); + while (Depth--) + FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, + MachinePointerInfo(), + false, false, 0); + return FrameAddr; +} + +SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, + SelectionDAG &DAG) const { + return DAG.getIntPtrConstant(2*TD->getPointerSize()); +} + +SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + SDValue Chain = Op.getOperand(0); + SDValue Offset = Op.getOperand(1); + SDValue Handler = Op.getOperand(2); + DebugLoc dl = Op.getDebugLoc(); + + SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, + Subtarget->is64Bit() ? X86::RBP : X86::EBP, + getPointerTy()); + unsigned StoreAddrReg = (Subtarget->is64Bit() ? X86::RCX : X86::ECX); + + SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Frame, + DAG.getIntPtrConstant(TD->getPointerSize())); + StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset); + Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(), + false, false, 0); + Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); + MF.getRegInfo().addLiveOut(StoreAddrReg); + + return DAG.getNode(X86ISD::EH_RETURN, dl, + MVT::Other, + Chain, DAG.getRegister(StoreAddrReg, getPointerTy())); +} + +SDValue X86TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op, + SelectionDAG &DAG) const { + return Op.getOperand(0); +} + +SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, + SelectionDAG &DAG) const { + SDValue Root = Op.getOperand(0); + SDValue Trmp = Op.getOperand(1); // trampoline + SDValue FPtr = Op.getOperand(2); // nested function + SDValue Nest = Op.getOperand(3); // 'nest' parameter value + DebugLoc dl = Op.getDebugLoc(); + + const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); + + if (Subtarget->is64Bit()) { + SDValue OutChains[6]; + + // Large code-model. + const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode. + const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode. + + const unsigned char N86R10 = X86_MC::getX86RegNum(X86::R10); + const unsigned char N86R11 = X86_MC::getX86RegNum(X86::R11); + + const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix + + // Load the pointer to the nested function into R11. 
+ unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 + SDValue Addr = Trmp; + OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), + Addr, MachinePointerInfo(TrmpAddr), + false, false, 0); + + Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, + DAG.getConstant(2, MVT::i64)); + OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, + MachinePointerInfo(TrmpAddr, 2), + false, false, 2); + + // Load the 'nest' parameter value into R10. + // R10 is specified in X86CallingConv.td + OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 + Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, + DAG.getConstant(10, MVT::i64)); + OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), + Addr, MachinePointerInfo(TrmpAddr, 10), + false, false, 0); + + Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, + DAG.getConstant(12, MVT::i64)); + OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, + MachinePointerInfo(TrmpAddr, 12), + false, false, 2); + + // Jump to the nested function. + OpCode = (JMP64r << 8) | REX_WB; // jmpq *... + Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, + DAG.getConstant(20, MVT::i64)); + OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), + Addr, MachinePointerInfo(TrmpAddr, 20), + false, false, 0); + + unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 + Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, + DAG.getConstant(22, MVT::i64)); + OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr, + MachinePointerInfo(TrmpAddr, 22), + false, false, 0); + + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6); + } else { + const Function *Func = + cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); + CallingConv::ID CC = Func->getCallingConv(); + unsigned NestReg; + + switch (CC) { + default: + llvm_unreachable("Unsupported calling convention"); + case CallingConv::C: + case CallingConv::X86_StdCall: { + // Pass 'nest' parameter in ECX. + // Must be kept in sync with X86CallingConv.td + NestReg = X86::ECX; + + // Check that ECX wasn't needed by an 'inreg' parameter. + FunctionType *FTy = Func->getFunctionType(); + const AttrListPtr &Attrs = Func->getAttributes(); + + if (!Attrs.isEmpty() && !Func->isVarArg()) { + unsigned InRegCount = 0; + unsigned Idx = 1; + + for (FunctionType::param_iterator I = FTy->param_begin(), + E = FTy->param_end(); I != E; ++I, ++Idx) + if (Attrs.paramHasAttr(Idx, Attribute::InReg)) + // FIXME: should only count parameters that are lowered to integers. + InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; + + if (InRegCount > 2) { + report_fatal_error("Nest register in use - reduce number of inreg" + " parameters!"); + } + } + break; + } + case CallingConv::X86_FastCall: + case CallingConv::X86_ThisCall: + case CallingConv::Fast: + // Pass 'nest' parameter in EAX. + // Must be kept in sync with X86CallingConv.td + NestReg = X86::EAX; + break; + } + + SDValue OutChains[4]; + SDValue Addr, Disp; + + Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, + DAG.getConstant(10, MVT::i32)); + Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); + + // This is storing the opcode for MOV32ri. + const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte. 
+ const unsigned char N86Reg = X86_MC::getX86RegNum(NestReg); + OutChains[0] = DAG.getStore(Root, dl, + DAG.getConstant(MOV32ri|N86Reg, MVT::i8), + Trmp, MachinePointerInfo(TrmpAddr), + false, false, 0); + + Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, + DAG.getConstant(1, MVT::i32)); + OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, + MachinePointerInfo(TrmpAddr, 1), + false, false, 1); + + const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode. + Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, + DAG.getConstant(5, MVT::i32)); + OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr, + MachinePointerInfo(TrmpAddr, 5), + false, false, 1); + + Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, + DAG.getConstant(6, MVT::i32)); + OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, + MachinePointerInfo(TrmpAddr, 6), + false, false, 1); + + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4); + } +} + +SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, + SelectionDAG &DAG) const { + /* + The rounding mode is in bits 11:10 of FPSR, and has the following + settings: + 00 Round to nearest + 01 Round to -inf + 10 Round to +inf + 11 Round to 0 + + FLT_ROUNDS, on the other hand, expects the following: + -1 Undefined + 0 Round to 0 + 1 Round to nearest + 2 Round to +inf + 3 Round to -inf + + To perform the conversion, we do: + (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3) + */ + + MachineFunction &MF = DAG.getMachineFunction(); + const TargetMachine &TM = MF.getTarget(); + const TargetFrameLowering &TFI = *TM.getFrameLowering(); + unsigned StackAlignment = TFI.getStackAlignment(); + EVT VT = Op.getValueType(); + DebugLoc DL = Op.getDebugLoc(); + + // Save FP Control Word to stack slot + int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false); + SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); + + + MachineMemOperand *MMO = + MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), + MachineMemOperand::MOStore, 2, 2); + + SDValue Ops[] = { DAG.getEntryNode(), StackSlot }; + SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, + DAG.getVTList(MVT::Other), + Ops, 2, MVT::i16, MMO); + + // Load FP Control Word from stack slot + SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, + MachinePointerInfo(), false, false, 0); + + // Transform as necessary + SDValue CWD1 = + DAG.getNode(ISD::SRL, DL, MVT::i16, + DAG.getNode(ISD::AND, DL, MVT::i16, + CWD, DAG.getConstant(0x800, MVT::i16)), + DAG.getConstant(11, MVT::i8)); + SDValue CWD2 = + DAG.getNode(ISD::SRL, DL, MVT::i16, + DAG.getNode(ISD::AND, DL, MVT::i16, + CWD, DAG.getConstant(0x400, MVT::i16)), + DAG.getConstant(9, MVT::i8)); + + SDValue RetVal = + DAG.getNode(ISD::AND, DL, MVT::i16, + DAG.getNode(ISD::ADD, DL, MVT::i16, + DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2), + DAG.getConstant(1, MVT::i16)), + DAG.getConstant(3, MVT::i16)); + + + return DAG.getNode((VT.getSizeInBits() < 16 ? + ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal); +} + +SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + EVT OpVT = VT; + unsigned NumBits = VT.getSizeInBits(); + DebugLoc dl = Op.getDebugLoc(); + + Op = Op.getOperand(0); + if (VT == MVT::i8) { + // Zero extend to i32 since there is not an i8 bsr. + OpVT = MVT::i32; + Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); + } + + // Issue a bsr (scan bits in reverse) which also sets EFLAGS. 
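// Illustrative aside (not from the imported source): a scalar restatement of
// the LowerFLT_ROUNDS_ conversion above, taking the control-word RC field
// (bits 11:10) as input; hypothetical helper name, mapping as documented in
// that comment (00 nearest -> 1, 01 -inf -> 3, 10 +inf -> 2, 11 zero -> 0):
//
//   static int flt_rounds_from_cw(unsigned fpsr) {
//     return ((((fpsr & 0x800) >> 11) | ((fpsr & 0x400) >> 9)) + 1) & 3;
//   }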
+ SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); + Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); + + // If src is zero (i.e. bsr sets ZF), returns NumBits. + SDValue Ops[] = { + Op, + DAG.getConstant(NumBits+NumBits-1, OpVT), + DAG.getConstant(X86::COND_E, MVT::i8), + Op.getValue(1) + }; + Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); + + // Finally xor with NumBits-1. + Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); + + if (VT == MVT::i8) + Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); + return Op; +} + +SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + EVT OpVT = VT; + unsigned NumBits = VT.getSizeInBits(); + DebugLoc dl = Op.getDebugLoc(); + + Op = Op.getOperand(0); + if (VT == MVT::i8) { + OpVT = MVT::i32; + Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); + } + + // Issue a bsf (scan bits forward) which also sets EFLAGS. + SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); + Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op); + + // If src is zero (i.e. bsf sets ZF), returns NumBits. + SDValue Ops[] = { + Op, + DAG.getConstant(NumBits, OpVT), + DAG.getConstant(X86::COND_E, MVT::i8), + Op.getValue(1) + }; + Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); + + if (VT == MVT::i8) + Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); + return Op; +} + +// Lower256IntArith - Break a 256-bit integer operation into two new 128-bit +// ones, and then concatenate the result back. +static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getValueType(); + + assert(VT.getSizeInBits() == 256 && VT.isInteger() && + "Unsupported value type for operation"); + + int NumElems = VT.getVectorNumElements(); + DebugLoc dl = Op.getDebugLoc(); + SDValue Idx0 = DAG.getConstant(0, MVT::i32); + SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32); + + // Extract the LHS vectors + SDValue LHS = Op.getOperand(0); + SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl); + SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl); + + // Extract the RHS vectors + SDValue RHS = Op.getOperand(1); + SDValue RHS1 = Extract128BitVector(RHS, Idx0, DAG, dl); + SDValue RHS2 = Extract128BitVector(RHS, Idx1, DAG, dl); + + MVT EltVT = VT.getVectorElementType().getSimpleVT(); + EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); + + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, + DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1), + DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2)); +} + +SDValue X86TargetLowering::LowerADD(SDValue Op, SelectionDAG &DAG) const { + assert(Op.getValueType().getSizeInBits() == 256 && + Op.getValueType().isInteger() && + "Only handle AVX 256-bit vector integer operation"); + return Lower256IntArith(Op, DAG); +} + +SDValue X86TargetLowering::LowerSUB(SDValue Op, SelectionDAG &DAG) const { + assert(Op.getValueType().getSizeInBits() == 256 && + Op.getValueType().isInteger() && + "Only handle AVX 256-bit vector integer operation"); + return Lower256IntArith(Op, DAG); +} + +SDValue X86TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + + // Decompose 256-bit ops into smaller 128-bit ops. 
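// Illustrative aside (not from the imported source): the CTLZ/CTTZ lowerings
// above lean on BSR/BSF.  BSR yields the index of the highest set bit, so
// ctlz is that index XORed with 31 (equivalent to 31 - index for a 5-bit
// value), and the CMOV supplies the "input was zero" result.  Hypothetical
// scalar model:
//
//   static unsigned ctlz32(uint32_t x) {
//     if (x == 0) return 32;              // the COND_E CMOV path
//     unsigned idx = 31;
//     while (!(x & (1u << idx))) --idx;   // what BSR computes
//     return idx ^ 31;                    // the final XOR in LowerCTLZ
//   }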
+ if (VT.getSizeInBits() == 256) + return Lower256IntArith(Op, DAG); + + assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply"); + DebugLoc dl = Op.getDebugLoc(); + + // ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32); + // ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32); + // ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b ); + // ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi ); + // ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b ); + // + // AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 ); + // AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 ); + // return AloBlo + AloBhi + AhiBlo; + + SDValue A = Op.getOperand(0); + SDValue B = Op.getOperand(1); + + SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), + A, DAG.getConstant(32, MVT::i32)); + SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), + B, DAG.getConstant(32, MVT::i32)); + SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), + A, B); + SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), + A, Bhi); + SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), + Ahi, B); + AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), + AloBhi, DAG.getConstant(32, MVT::i32)); + AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), + AhiBlo, DAG.getConstant(32, MVT::i32)); + SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); + Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); + return Res; +} + +SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { + + EVT VT = Op.getValueType(); + DebugLoc dl = Op.getDebugLoc(); + SDValue R = Op.getOperand(0); + SDValue Amt = Op.getOperand(1); + LLVMContext *Context = DAG.getContext(); + + if (!Subtarget->hasXMMInt()) + return SDValue(); + + // Decompose 256-bit shifts into smaller 128-bit shifts. 
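// Illustrative aside (not from the imported source): the v2i64 multiply above
// is the usual 32x32 split, since PMULUDQ only multiplies the low 32 bits of
// each lane.  Scalar model of the same arithmetic, hypothetical helper name,
// all math mod 2^64:
//
//   static uint64_t mul64_from_32(uint64_t a, uint64_t b) {
//     uint64_t alo = (uint32_t)a, ahi = a >> 32;
//     uint64_t blo = (uint32_t)b, bhi = b >> 32;
//     // ahi*bhi would land at bit 64 and above, so it drops out entirely.
//     return alo * blo + ((alo * bhi + ahi * blo) << 32);
//   }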
+ if (VT.getSizeInBits() == 256) { + int NumElems = VT.getVectorNumElements(); + MVT EltVT = VT.getVectorElementType().getSimpleVT(); + EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); + + // Extract the two vectors + SDValue V1 = Extract128BitVector(R, DAG.getConstant(0, MVT::i32), DAG, dl); + SDValue V2 = Extract128BitVector(R, DAG.getConstant(NumElems/2, MVT::i32), + DAG, dl); + + // Recreate the shift amount vectors + SDValue Amt1, Amt2; + if (Amt.getOpcode() == ISD::BUILD_VECTOR) { + // Constant shift amount + SmallVector<SDValue, 4> Amt1Csts; + SmallVector<SDValue, 4> Amt2Csts; + for (int i = 0; i < NumElems/2; ++i) + Amt1Csts.push_back(Amt->getOperand(i)); + for (int i = NumElems/2; i < NumElems; ++i) + Amt2Csts.push_back(Amt->getOperand(i)); + + Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, + &Amt1Csts[0], NumElems/2); + Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, + &Amt2Csts[0], NumElems/2); + } else { + // Variable shift amount + Amt1 = Extract128BitVector(Amt, DAG.getConstant(0, MVT::i32), DAG, dl); + Amt2 = Extract128BitVector(Amt, DAG.getConstant(NumElems/2, MVT::i32), + DAG, dl); + } + + // Issue new vector shifts for the smaller types + V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1); + V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2); + + // Concatenate the result back + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2); + } + + // Optimize shl/srl/sra with constant shift amount. + if (isSplatVector(Amt.getNode())) { + SDValue SclrAmt = Amt->getOperand(0); + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) { + uint64_t ShiftAmt = C->getZExtValue(); + + if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SHL) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), + R, DAG.getConstant(ShiftAmt, MVT::i32)); + + if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SHL) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), + R, DAG.getConstant(ShiftAmt, MVT::i32)); + + if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SHL) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), + R, DAG.getConstant(ShiftAmt, MVT::i32)); + + if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SRL) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), + R, DAG.getConstant(ShiftAmt, MVT::i32)); + + if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SRL) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32), + R, DAG.getConstant(ShiftAmt, MVT::i32)); + + if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SRL) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32), + R, DAG.getConstant(ShiftAmt, MVT::i32)); + + if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SRA) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32), + R, DAG.getConstant(ShiftAmt, MVT::i32)); + + if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SRA) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32), + R, DAG.getConstant(ShiftAmt, MVT::i32)); + } + } + + // Lower SHL with variable shift amount. 
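An illustrative aside, not taken from this patch: the variable-amount v4i32 SHL handled just below builds 2^amt in the exponent field of an IEEE float; shifting the amount into bits 23..30, adding the bias pattern 0x3f800000, bitcasting to float and converting back to integer yields 2^amt, so a plain multiply finishes the shift. A minimal scalar sketch of one lane (the helper name is invented; valid for 0 <= amt <= 31):

#include <stdint.h>
#include <string.h>

static uint32_t shl_via_float_exponent(uint32_t x, uint32_t amt) {
  uint32_t bits = (amt << 23) + 0x3f800000u;  // exponent field becomes 127 + amt
  float p;
  memcpy(&p, &bits, sizeof p);                // the ISD::BITCAST to v4f32
  return x * (uint32_t)p;                     // p == 2^amt, so this is x << amt
}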
+ if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) { + Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), + Op.getOperand(1), DAG.getConstant(23, MVT::i32)); + + ConstantInt *CI = ConstantInt::get(*Context, APInt(32, 0x3f800000U)); + + std::vector<Constant*> CV(4, CI); + Constant *C = ConstantVector::get(CV); + SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); + SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(), + false, false, 16); + + Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend); + Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op); + Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); + return DAG.getNode(ISD::MUL, dl, VT, Op, R); + } + if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) { + // a = a << 5; + Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), + Op.getOperand(1), DAG.getConstant(5, MVT::i32)); + + ConstantInt *CM1 = ConstantInt::get(*Context, APInt(8, 15)); + ConstantInt *CM2 = ConstantInt::get(*Context, APInt(8, 63)); + + std::vector<Constant*> CVM1(16, CM1); + std::vector<Constant*> CVM2(16, CM2); + Constant *C = ConstantVector::get(CVM1); + SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); + SDValue M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(), + false, false, 16); + + // r = pblendv(r, psllw(r & (char16)15, 4), a); + M = DAG.getNode(ISD::AND, dl, VT, R, M); + M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, + DAG.getConstant(4, MVT::i32)); + R = DAG.getNode(ISD::VSELECT, dl, VT, Op, R, M); + // a += a + Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); + + C = ConstantVector::get(CVM2); + CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); + M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(), + false, false, 16); + + // r = pblendv(r, psllw(r & (char16)63, 2), a); + M = DAG.getNode(ISD::AND, dl, VT, R, M); + M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, + DAG.getConstant(2, MVT::i32)); + R = DAG.getNode(ISD::VSELECT, dl, VT, Op, R, M); + // a += a + Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); + + // return pblendv(r, r+r, a); + R = DAG.getNode(ISD::VSELECT, dl, VT, Op, + R, DAG.getNode(ISD::ADD, dl, VT, R, R)); + return R; + } + return SDValue(); +} + +SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { + // Lower the "add/sub/mul with overflow" instruction into a regular ins plus + // a "setcc" instruction that checks the overflow flag. The "brcond" lowering + // looks for this combo and may remove the "setcc" instruction if the "setcc" + // has only one use. + SDNode *N = Op.getNode(); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + unsigned BaseOp = 0; + unsigned Cond = 0; + DebugLoc DL = Op.getDebugLoc(); + switch (Op.getOpcode()) { + default: llvm_unreachable("Unknown ovf instruction!"); + case ISD::SADDO: + // A subtract of one will be selected as a INC. Note that INC doesn't + // set CF, so we can't do this for UADDO. 
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) + if (C->isOne()) { + BaseOp = X86ISD::INC; + Cond = X86::COND_O; + break; + } + BaseOp = X86ISD::ADD; + Cond = X86::COND_O; + break; + case ISD::UADDO: + BaseOp = X86ISD::ADD; + Cond = X86::COND_B; + break; + case ISD::SSUBO: + // A subtract of one will be selected as a DEC. Note that DEC doesn't + // set CF, so we can't do this for USUBO. + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) + if (C->isOne()) { + BaseOp = X86ISD::DEC; + Cond = X86::COND_O; + break; + } + BaseOp = X86ISD::SUB; + Cond = X86::COND_O; + break; + case ISD::USUBO: + BaseOp = X86ISD::SUB; + Cond = X86::COND_B; + break; + case ISD::SMULO: + BaseOp = X86ISD::SMUL; + Cond = X86::COND_O; + break; + case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs + SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0), + MVT::i32); + SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS); + + SDValue SetCC = + DAG.getNode(X86ISD::SETCC, DL, MVT::i8, + DAG.getConstant(X86::COND_O, MVT::i32), + SDValue(Sum.getNode(), 2)); + + return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); + } + } + + // Also sets EFLAGS. + SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32); + SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS); + + SDValue SetCC = + DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1), + DAG.getConstant(Cond, MVT::i32), + SDValue(Sum.getNode(), 1)); + + return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); +} + +SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const{ + DebugLoc dl = Op.getDebugLoc(); + SDNode* Node = Op.getNode(); + EVT ExtraVT = cast<VTSDNode>(Node->getOperand(1))->getVT(); + EVT VT = Node->getValueType(0); + if (Subtarget->hasXMMInt() && VT.isVector()) { + unsigned BitsDiff = VT.getScalarType().getSizeInBits() - + ExtraVT.getScalarType().getSizeInBits(); + SDValue ShAmt = DAG.getConstant(BitsDiff, MVT::i32); + + unsigned SHLIntrinsicsID = 0; + unsigned SRAIntrinsicsID = 0; + switch (VT.getSimpleVT().SimpleTy) { + default: + return SDValue(); + case MVT::v4i32: { + SHLIntrinsicsID = Intrinsic::x86_sse2_pslli_d; + SRAIntrinsicsID = Intrinsic::x86_sse2_psrai_d; + break; + } + case MVT::v8i16: { + SHLIntrinsicsID = Intrinsic::x86_sse2_pslli_w; + SRAIntrinsicsID = Intrinsic::x86_sse2_psrai_w; + break; + } + } + + SDValue Tmp1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(SHLIntrinsicsID, MVT::i32), + Node->getOperand(0), ShAmt); + + // In case of 1 bit sext, no need to shr + if (ExtraVT.getScalarType().getSizeInBits() == 1) return Tmp1; + + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(SRAIntrinsicsID, MVT::i32), + Tmp1, ShAmt); + } + + return SDValue(); +} + + +SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{ + DebugLoc dl = Op.getDebugLoc(); + + // Go ahead and emit the fence on x86-64 even if we asked for no-sse2. + // There isn't any reason to disable it if the target processor supports it. + if (!Subtarget->hasXMMInt() && !Subtarget->is64Bit()) { + SDValue Chain = Op.getOperand(0); + SDValue Zero = DAG.getConstant(0, MVT::i32); + SDValue Ops[] = { + DAG.getRegister(X86::ESP, MVT::i32), // Base + DAG.getTargetConstant(1, MVT::i8), // Scale + DAG.getRegister(0, MVT::i32), // Index + DAG.getTargetConstant(0, MVT::i32), // Disp + DAG.getRegister(0, MVT::i32), // Segment. 
+ Zero, + Chain + }; + SDNode *Res = + DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops, + array_lengthof(Ops)); + return SDValue(Res, 0); + } + + unsigned isDev = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue(); + if (!isDev) + return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); + + unsigned Op1 = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + unsigned Op2 = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); + unsigned Op3 = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); + unsigned Op4 = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); + + // def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>; + if (!Op1 && !Op2 && !Op3 && Op4) + return DAG.getNode(X86ISD::SFENCE, dl, MVT::Other, Op.getOperand(0)); + + // def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>; + if (Op1 && !Op2 && !Op3 && !Op4) + return DAG.getNode(X86ISD::LFENCE, dl, MVT::Other, Op.getOperand(0)); + + // def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm), (i8 1)), + // (MFENCE)>; + return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); +} + +SDValue X86TargetLowering::LowerATOMIC_FENCE(SDValue Op, + SelectionDAG &DAG) const { + DebugLoc dl = Op.getDebugLoc(); + AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>( + cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()); + SynchronizationScope FenceScope = static_cast<SynchronizationScope>( + cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue()); + + // The only fence that needs an instruction is a sequentially-consistent + // cross-thread fence. + if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) { + // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for + // no-sse2). There isn't any reason to disable it if the target processor + // supports it. + if (Subtarget->hasXMMInt() || Subtarget->is64Bit()) + return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); + + SDValue Chain = Op.getOperand(0); + SDValue Zero = DAG.getConstant(0, MVT::i32); + SDValue Ops[] = { + DAG.getRegister(X86::ESP, MVT::i32), // Base + DAG.getTargetConstant(1, MVT::i8), // Scale + DAG.getRegister(0, MVT::i32), // Index + DAG.getTargetConstant(0, MVT::i32), // Disp + DAG.getRegister(0, MVT::i32), // Segment. + Zero, + Chain + }; + SDNode *Res = + DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops, + array_lengthof(Ops)); + return SDValue(Res, 0); + } + + // MEMBARRIER is a compiler barrier; it codegens to a no-op. 
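An illustrative aside, not taken from this patch: the fallback path above (32-bit, no SSE2) emits a locked OR of zero to the top of the stack instead of MFENCE; any LOCK-prefixed read-modify-write is a full memory barrier on x86, and the dummy store to (%esp) is harmless. Roughly the same idea in GCC/Clang inline assembly, for 32-bit builds only and purely as a sketch:

static void full_fence_without_sse2(void) {
  // A locked no-op RMW on the stack slot acts as a full fence on pre-SSE2 CPUs.
  __asm__ __volatile__("lock orl $0, (%%esp)" ::: "memory", "cc");
}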
+ return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); +} + + +SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const { + EVT T = Op.getValueType(); + DebugLoc DL = Op.getDebugLoc(); + unsigned Reg = 0; + unsigned size = 0; + switch(T.getSimpleVT().SimpleTy) { + default: + assert(false && "Invalid value type!"); + case MVT::i8: Reg = X86::AL; size = 1; break; + case MVT::i16: Reg = X86::AX; size = 2; break; + case MVT::i32: Reg = X86::EAX; size = 4; break; + case MVT::i64: + assert(Subtarget->is64Bit() && "Node not type legal!"); + Reg = X86::RAX; size = 8; + break; + } + SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg, + Op.getOperand(2), SDValue()); + SDValue Ops[] = { cpIn.getValue(0), + Op.getOperand(1), + Op.getOperand(3), + DAG.getTargetConstant(size, MVT::i8), + cpIn.getValue(1) }; + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); + MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand(); + SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys, + Ops, 5, T, MMO); + SDValue cpOut = + DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1)); + return cpOut; +} + +SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op, + SelectionDAG &DAG) const { + assert(Subtarget->is64Bit() && "Result not type legalized?"); + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue TheChain = Op.getOperand(0); + DebugLoc dl = Op.getDebugLoc(); + SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); + SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1)); + SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64, + rax.getValue(2)); + SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx, + DAG.getConstant(32, MVT::i8)); + SDValue Ops[] = { + DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp), + rdx.getValue(1) + }; + return DAG.getMergeValues(Ops, 2, dl); +} + +SDValue X86TargetLowering::LowerBITCAST(SDValue Op, + SelectionDAG &DAG) const { + EVT SrcVT = Op.getOperand(0).getValueType(); + EVT DstVT = Op.getValueType(); + assert(Subtarget->is64Bit() && !Subtarget->hasXMMInt() && + Subtarget->hasMMX() && "Unexpected custom BITCAST"); + assert((DstVT == MVT::i64 || + (DstVT.isVector() && DstVT.getSizeInBits()==64)) && + "Unexpected custom BITCAST"); + // i64 <=> MMX conversions are Legal. + if (SrcVT==MVT::i64 && DstVT.isVector()) + return Op; + if (DstVT==MVT::i64 && SrcVT.isVector()) + return Op; + // MMX <=> MMX conversions are Legal. + if (SrcVT.isVector() && DstVT.isVector()) + return Op; + // All other conversions need to be expanded. 
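An illustrative aside, not taken from this patch: LowerREADCYCLECOUNTER above models RDTSC, which returns the counter split across EDX:EAX (copied out of RDX:RAX on this 64-bit-only path); the DAG then glues the halves back together with a shift and an or. The same recombination as a GCC/Clang inline-assembly sketch (the helper name is invented):

#include <stdint.h>

static uint64_t read_cycle_counter(void) {
  uint32_t lo, hi;
  __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));  // EAX = low 32 bits, EDX = high 32 bits
  return ((uint64_t)hi << 32) | lo;                    // the SHL-by-32 followed by OR above
}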
+ return SDValue(); +} + +SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const { + SDNode *Node = Op.getNode(); + DebugLoc dl = Node->getDebugLoc(); + EVT T = Node->getValueType(0); + SDValue negOp = DAG.getNode(ISD::SUB, dl, T, + DAG.getConstant(0, T), Node->getOperand(2)); + return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, + cast<AtomicSDNode>(Node)->getMemoryVT(), + Node->getOperand(0), + Node->getOperand(1), negOp, + cast<AtomicSDNode>(Node)->getSrcValue(), + cast<AtomicSDNode>(Node)->getAlignment(), + cast<AtomicSDNode>(Node)->getOrdering(), + cast<AtomicSDNode>(Node)->getSynchScope()); +} + +static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) { + SDNode *Node = Op.getNode(); + DebugLoc dl = Node->getDebugLoc(); + EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT(); + + // Convert seq_cst store -> xchg + // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b) + // FIXME: On 32-bit, store -> fist or movq would be more efficient + // (The only way to get a 16-byte store is cmpxchg16b) + // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment. + if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent || + !DAG.getTargetLoweringInfo().isTypeLegal(VT)) { + SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, + cast<AtomicSDNode>(Node)->getMemoryVT(), + Node->getOperand(0), + Node->getOperand(1), Node->getOperand(2), + cast<AtomicSDNode>(Node)->getMemOperand(), + cast<AtomicSDNode>(Node)->getOrdering(), + cast<AtomicSDNode>(Node)->getSynchScope()); + return Swap.getValue(1); + } + // Other atomic stores have a simple pattern. + return Op; +} + +static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getNode()->getValueType(0); + + // Let legalize expand this if it isn't a legal type yet. + if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) + return SDValue(); + + SDVTList VTs = DAG.getVTList(VT, MVT::i32); + + unsigned Opc; + bool ExtraOp = false; + switch (Op.getOpcode()) { + default: assert(0 && "Invalid code"); + case ISD::ADDC: Opc = X86ISD::ADD; break; + case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break; + case ISD::SUBC: Opc = X86ISD::SUB; break; + case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break; + } + + if (!ExtraOp) + return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0), + Op.getOperand(1)); + return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0), + Op.getOperand(1), Op.getOperand(2)); +} + +/// LowerOperation - Provide custom lowering hooks for some operations. 
+/// +SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { + switch (Op.getOpcode()) { + default: llvm_unreachable("Should not custom lower this!"); + case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op,DAG); + case ISD::MEMBARRIER: return LowerMEMBARRIER(Op,DAG); + case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op,DAG); + case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG); + case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); + case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG); + case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); + case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); + case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); + case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); + case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); + case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); + case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, DAG); + case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); + case ISD::ConstantPool: return LowerConstantPool(Op, DAG); + case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); + case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); + case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); + case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); + case ISD::SHL_PARTS: + case ISD::SRA_PARTS: + case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG); + case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); + case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); + case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); + case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); + case ISD::FABS: return LowerFABS(Op, DAG); + case ISD::FNEG: return LowerFNEG(Op, DAG); + case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); + case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG); + case ISD::SETCC: return LowerSETCC(Op, DAG); + case ISD::SELECT: return LowerSELECT(Op, DAG); + case ISD::BRCOND: return LowerBRCOND(Op, DAG); + case ISD::JumpTable: return LowerJumpTable(Op, DAG); + case ISD::VASTART: return LowerVASTART(Op, DAG); + case ISD::VAARG: return LowerVAARG(Op, DAG); + case ISD::VACOPY: return LowerVACOPY(Op, DAG); + case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); + case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); + case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); + case ISD::FRAME_TO_ARGS_OFFSET: + return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); + case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); + case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); + case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG); + case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG); + case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); + case ISD::CTLZ: return LowerCTLZ(Op, DAG); + case ISD::CTTZ: return LowerCTTZ(Op, DAG); + case ISD::MUL: return LowerMUL(Op, DAG); + case ISD::SRA: + case ISD::SRL: + case ISD::SHL: return LowerShift(Op, DAG); + case ISD::SADDO: + case ISD::UADDO: + case ISD::SSUBO: + case ISD::USUBO: + case ISD::SMULO: + case ISD::UMULO: return LowerXALUO(Op, DAG); + case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG); + case ISD::BITCAST: return LowerBITCAST(Op, DAG); + case ISD::ADDC: + case ISD::ADDE: + case ISD::SUBC: + case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); + case ISD::ADD: return LowerADD(Op, DAG); + case ISD::SUB: return LowerSUB(Op, DAG); + } +} + 
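An illustrative aside, not taken from this patch: LowerADDC_ADDE_SUBC_SUBE above maps multi-word arithmetic onto X86ISD::ADD/ADC (and SUB/SBB), where the second operation consumes the carry produced by the first. A minimal scalar picture of that chain for a 128-bit add split into two i64 halves (the helper name is invented):

#include <stdint.h>

static void add128(uint64_t a_lo, uint64_t a_hi, uint64_t b_lo, uint64_t b_hi,
                   uint64_t *r_lo, uint64_t *r_hi) {
  *r_lo = a_lo + b_lo;               // ADDC: produces the low sum and sets CF
  uint64_t carry = (*r_lo < a_lo);   // CF is 1 exactly when the low add wrapped
  *r_hi = a_hi + b_hi + carry;       // ADDE/ADC: folds the carry into the high sum
}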
+static void ReplaceATOMIC_LOAD(SDNode *Node, + SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG) { + DebugLoc dl = Node->getDebugLoc(); + EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT(); + + // Convert wide load -> cmpxchg8b/cmpxchg16b + // FIXME: On 32-bit, load -> fild or movq would be more efficient + // (The only way to get a 16-byte load is cmpxchg16b) + // FIXME: 16-byte ATOMIC_CMP_SWAP isn't actually hooked up at the moment. + SDValue Zero = DAG.getConstant(0, VT); + SDValue Swap = DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, dl, VT, + Node->getOperand(0), + Node->getOperand(1), Zero, Zero, + cast<AtomicSDNode>(Node)->getMemOperand(), + cast<AtomicSDNode>(Node)->getOrdering(), + cast<AtomicSDNode>(Node)->getSynchScope()); + Results.push_back(Swap.getValue(0)); + Results.push_back(Swap.getValue(1)); +} + +void X86TargetLowering:: +ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results, + SelectionDAG &DAG, unsigned NewOp) const { + DebugLoc dl = Node->getDebugLoc(); + assert (Node->getValueType(0) == MVT::i64 && + "Only know how to expand i64 atomics"); + + SDValue Chain = Node->getOperand(0); + SDValue In1 = Node->getOperand(1); + SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, + Node->getOperand(2), DAG.getIntPtrConstant(0)); + SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, + Node->getOperand(2), DAG.getIntPtrConstant(1)); + SDValue Ops[] = { Chain, In1, In2L, In2H }; + SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); + SDValue Result = + DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64, + cast<MemSDNode>(Node)->getMemOperand()); + SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)}; + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); + Results.push_back(Result.getValue(2)); +} + +/// ReplaceNodeResults - Replace a node with an illegal result type +/// with a new node built out of custom code. +void X86TargetLowering::ReplaceNodeResults(SDNode *N, + SmallVectorImpl<SDValue>&Results, + SelectionDAG &DAG) const { + DebugLoc dl = N->getDebugLoc(); + switch (N->getOpcode()) { + default: + assert(false && "Do not know how to custom type legalize this operation!"); + return; + case ISD::SIGN_EXTEND_INREG: + case ISD::ADDC: + case ISD::ADDE: + case ISD::SUBC: + case ISD::SUBE: + // We don't want to expand or promote these. + return; + case ISD::FP_TO_SINT: { + std::pair<SDValue,SDValue> Vals = + FP_TO_INTHelper(SDValue(N, 0), DAG, true); + SDValue FIST = Vals.first, StackSlot = Vals.second; + if (FIST.getNode() != 0) { + EVT VT = N->getValueType(0); + // Return a load from the stack slot. + Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, + MachinePointerInfo(), false, false, 0)); + } + return; + } + case ISD::READCYCLECOUNTER: { + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue TheChain = N->getOperand(0); + SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); + SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32, + rd.getValue(1)); + SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32, + eax.getValue(2)); + // Use a buildpair to merge the two 32-bit values into a 64-bit one. + SDValue Ops[] = { eax, edx }; + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2)); + Results.push_back(edx.getValue(1)); + return; + } + case ISD::ATOMIC_CMP_SWAP: { + EVT T = N->getValueType(0); + assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair"); + bool Regs64bit = T == MVT::i128; + EVT HalfT = Regs64bit ? 
MVT::i64 : MVT::i32; + SDValue cpInL, cpInH; + cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), + DAG.getConstant(0, HalfT)); + cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), + DAG.getConstant(1, HalfT)); + cpInL = DAG.getCopyToReg(N->getOperand(0), dl, + Regs64bit ? X86::RAX : X86::EAX, + cpInL, SDValue()); + cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, + Regs64bit ? X86::RDX : X86::EDX, + cpInH, cpInL.getValue(1)); + SDValue swapInL, swapInH; + swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3), + DAG.getConstant(0, HalfT)); + swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3), + DAG.getConstant(1, HalfT)); + swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, + Regs64bit ? X86::RBX : X86::EBX, + swapInL, cpInH.getValue(1)); + swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, + Regs64bit ? X86::RCX : X86::ECX, + swapInH, swapInL.getValue(1)); + SDValue Ops[] = { swapInH.getValue(0), + N->getOperand(1), + swapInH.getValue(1) }; + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); + MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand(); + unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG : + X86ISD::LCMPXCHG8_DAG; + SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, + Ops, 3, T, MMO); + SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, + Regs64bit ? X86::RAX : X86::EAX, + HalfT, Result.getValue(1)); + SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, + Regs64bit ? X86::RDX : X86::EDX, + HalfT, cpOutL.getValue(2)); + SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF, 2)); + Results.push_back(cpOutH.getValue(1)); + return; + } + case ISD::ATOMIC_LOAD_ADD: + ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG); + return; + case ISD::ATOMIC_LOAD_AND: + ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG); + return; + case ISD::ATOMIC_LOAD_NAND: + ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG); + return; + case ISD::ATOMIC_LOAD_OR: + ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG); + return; + case ISD::ATOMIC_LOAD_SUB: + ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG); + return; + case ISD::ATOMIC_LOAD_XOR: + ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG); + return; + case ISD::ATOMIC_SWAP: + ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG); + return; + case ISD::ATOMIC_LOAD: + ReplaceATOMIC_LOAD(N, Results, DAG); + } +} + +const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { + switch (Opcode) { + default: return NULL; + case X86ISD::BSF: return "X86ISD::BSF"; + case X86ISD::BSR: return "X86ISD::BSR"; + case X86ISD::SHLD: return "X86ISD::SHLD"; + case X86ISD::SHRD: return "X86ISD::SHRD"; + case X86ISD::FAND: return "X86ISD::FAND"; + case X86ISD::FOR: return "X86ISD::FOR"; + case X86ISD::FXOR: return "X86ISD::FXOR"; + case X86ISD::FSRL: return "X86ISD::FSRL"; + case X86ISD::FILD: return "X86ISD::FILD"; + case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; + case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM"; + case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM"; + case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM"; + case X86ISD::FLD: return "X86ISD::FLD"; + case X86ISD::FST: return "X86ISD::FST"; + case X86ISD::CALL: return "X86ISD::CALL"; + case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; + case X86ISD::BT: return 
"X86ISD::BT"; + case X86ISD::CMP: return "X86ISD::CMP"; + case X86ISD::COMI: return "X86ISD::COMI"; + case X86ISD::UCOMI: return "X86ISD::UCOMI"; + case X86ISD::SETCC: return "X86ISD::SETCC"; + case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; + case X86ISD::FSETCCsd: return "X86ISD::FSETCCsd"; + case X86ISD::FSETCCss: return "X86ISD::FSETCCss"; + case X86ISD::CMOV: return "X86ISD::CMOV"; + case X86ISD::BRCOND: return "X86ISD::BRCOND"; + case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; + case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; + case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; + case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; + case X86ISD::Wrapper: return "X86ISD::Wrapper"; + case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP"; + case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; + case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; + case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; + case X86ISD::PINSRB: return "X86ISD::PINSRB"; + case X86ISD::PINSRW: return "X86ISD::PINSRW"; + case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; + case X86ISD::ANDNP: return "X86ISD::ANDNP"; + case X86ISD::PSIGNB: return "X86ISD::PSIGNB"; + case X86ISD::PSIGNW: return "X86ISD::PSIGNW"; + case X86ISD::PSIGND: return "X86ISD::PSIGND"; + case X86ISD::FMAX: return "X86ISD::FMAX"; + case X86ISD::FMIN: return "X86ISD::FMIN"; + case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; + case X86ISD::FRCP: return "X86ISD::FRCP"; + case X86ISD::FHADD: return "X86ISD::FHADD"; + case X86ISD::FHSUB: return "X86ISD::FHSUB"; + case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; + case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; + case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; + case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; + case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; + case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; + case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; + case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG"; + case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG"; + case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG"; + case X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG"; + case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG"; + case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG"; + case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; + case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; + case X86ISD::VSHL: return "X86ISD::VSHL"; + case X86ISD::VSRL: return "X86ISD::VSRL"; + case X86ISD::CMPPD: return "X86ISD::CMPPD"; + case X86ISD::CMPPS: return "X86ISD::CMPPS"; + case X86ISD::PCMPEQB: return "X86ISD::PCMPEQB"; + case X86ISD::PCMPEQW: return "X86ISD::PCMPEQW"; + case X86ISD::PCMPEQD: return "X86ISD::PCMPEQD"; + case X86ISD::PCMPEQQ: return "X86ISD::PCMPEQQ"; + case X86ISD::PCMPGTB: return "X86ISD::PCMPGTB"; + case X86ISD::PCMPGTW: return "X86ISD::PCMPGTW"; + case X86ISD::PCMPGTD: return "X86ISD::PCMPGTD"; + case X86ISD::PCMPGTQ: return "X86ISD::PCMPGTQ"; + case X86ISD::ADD: return "X86ISD::ADD"; + case X86ISD::SUB: return "X86ISD::SUB"; + case X86ISD::ADC: return "X86ISD::ADC"; + case X86ISD::SBB: return "X86ISD::SBB"; + case X86ISD::SMUL: return "X86ISD::SMUL"; + case X86ISD::UMUL: return "X86ISD::UMUL"; + case X86ISD::INC: return "X86ISD::INC"; + case X86ISD::DEC: return "X86ISD::DEC"; + case X86ISD::OR: return "X86ISD::OR"; + case X86ISD::XOR: return "X86ISD::XOR"; + case X86ISD::AND: return "X86ISD::AND"; + case X86ISD::ANDN: return "X86ISD::ANDN"; + case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; + case X86ISD::PTEST: 
return "X86ISD::PTEST"; + case X86ISD::TESTP: return "X86ISD::TESTP"; + case X86ISD::PALIGN: return "X86ISD::PALIGN"; + case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; + case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; + case X86ISD::PSHUFHW_LD: return "X86ISD::PSHUFHW_LD"; + case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW"; + case X86ISD::PSHUFLW_LD: return "X86ISD::PSHUFLW_LD"; + case X86ISD::SHUFPS: return "X86ISD::SHUFPS"; + case X86ISD::SHUFPD: return "X86ISD::SHUFPD"; + case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS"; + case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD"; + case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS"; + case X86ISD::MOVHLPD: return "X86ISD::MOVHLPD"; + case X86ISD::MOVLPS: return "X86ISD::MOVLPS"; + case X86ISD::MOVLPD: return "X86ISD::MOVLPD"; + case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP"; + case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP"; + case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP"; + case X86ISD::MOVSHDUP_LD: return "X86ISD::MOVSHDUP_LD"; + case X86ISD::MOVSLDUP_LD: return "X86ISD::MOVSLDUP_LD"; + case X86ISD::MOVSD: return "X86ISD::MOVSD"; + case X86ISD::MOVSS: return "X86ISD::MOVSS"; + case X86ISD::UNPCKLPS: return "X86ISD::UNPCKLPS"; + case X86ISD::UNPCKLPD: return "X86ISD::UNPCKLPD"; + case X86ISD::VUNPCKLPDY: return "X86ISD::VUNPCKLPDY"; + case X86ISD::UNPCKHPS: return "X86ISD::UNPCKHPS"; + case X86ISD::UNPCKHPD: return "X86ISD::UNPCKHPD"; + case X86ISD::PUNPCKLBW: return "X86ISD::PUNPCKLBW"; + case X86ISD::PUNPCKLWD: return "X86ISD::PUNPCKLWD"; + case X86ISD::PUNPCKLDQ: return "X86ISD::PUNPCKLDQ"; + case X86ISD::PUNPCKLQDQ: return "X86ISD::PUNPCKLQDQ"; + case X86ISD::PUNPCKHBW: return "X86ISD::PUNPCKHBW"; + case X86ISD::PUNPCKHWD: return "X86ISD::PUNPCKHWD"; + case X86ISD::PUNPCKHDQ: return "X86ISD::PUNPCKHDQ"; + case X86ISD::PUNPCKHQDQ: return "X86ISD::PUNPCKHQDQ"; + case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; + case X86ISD::VPERMILPS: return "X86ISD::VPERMILPS"; + case X86ISD::VPERMILPSY: return "X86ISD::VPERMILPSY"; + case X86ISD::VPERMILPD: return "X86ISD::VPERMILPD"; + case X86ISD::VPERMILPDY: return "X86ISD::VPERMILPDY"; + case X86ISD::VPERM2F128: return "X86ISD::VPERM2F128"; + case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; + case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; + case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; + case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER"; + case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA"; + } +} + +// isLegalAddressingMode - Return true if the addressing mode represented +// by AM is legal for this target, for a load/store of the specified type. +bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, + Type *Ty) const { + // X86 supports extremely general addressing modes. + CodeModel::Model M = getTargetMachine().getCodeModel(); + Reloc::Model R = getTargetMachine().getRelocationModel(); + + // X86 allows a sign-extended 32-bit immediate field as a displacement. + if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL)) + return false; + + if (AM.BaseGV) { + unsigned GVFlags = + Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine()); + + // If a reference to this global requires an extra load, we can't fold it. + if (isGlobalStubReference(GVFlags)) + return false; + + // If BaseGV requires a register for the PIC base, we cannot also have a + // BaseReg specified. + if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags)) + return false; + + // If lower 4G is not available, then we must use rip-relative addressing. 
+ if ((M != CodeModel::Small || R != Reloc::Static) && + Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1)) + return false; + } + + switch (AM.Scale) { + case 0: + case 1: + case 2: + case 4: + case 8: + // These scales always work. + break; + case 3: + case 5: + case 9: + // These scales are formed with basereg+scalereg. Only accept if there is + // no basereg yet. + if (AM.HasBaseReg) + return false; + break; + default: // Other stuff never works. + return false; + } + + return true; +} + + +bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { + if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) + return false; + unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); + unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); + if (NumBits1 <= NumBits2) + return false; + return true; +} + +bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { + if (!VT1.isInteger() || !VT2.isInteger()) + return false; + unsigned NumBits1 = VT1.getSizeInBits(); + unsigned NumBits2 = VT2.getSizeInBits(); + if (NumBits1 <= NumBits2) + return false; + return true; +} + +bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { + // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. + return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit(); +} + +bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { + // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. + return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit(); +} + +bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { + // i16 instructions are longer (0x66 prefix) and potentially slower. + return !(VT1 == MVT::i32 && VT2 == MVT::i16); +} + +/// isShuffleMaskLegal - Targets can use this to indicate that they only +/// support *some* VECTOR_SHUFFLE operations, those with specific masks. +/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values +/// are assumed to be legal. +bool +X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, + EVT VT) const { + // Very little shuffling can be done for 64-bit vectors right now. + if (VT.getSizeInBits() == 64) + return isPALIGNRMask(M, VT, Subtarget->hasSSSE3() || Subtarget->hasAVX()); + + // FIXME: pshufb, blends, shifts. + return (VT.getVectorNumElements() == 2 || + ShuffleVectorSDNode::isSplatMask(&M[0], VT) || + isMOVLMask(M, VT) || + isSHUFPMask(M, VT) || + isPSHUFDMask(M, VT) || + isPSHUFHWMask(M, VT) || + isPSHUFLWMask(M, VT) || + isPALIGNRMask(M, VT, Subtarget->hasSSSE3() || Subtarget->hasAVX()) || + isUNPCKLMask(M, VT) || + isUNPCKHMask(M, VT) || + isUNPCKL_v_undef_Mask(M, VT) || + isUNPCKH_v_undef_Mask(M, VT)); +} + +bool +X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, + EVT VT) const { + unsigned NumElts = VT.getVectorNumElements(); + // FIXME: This collection of masks seems suspect. 
+ if (NumElts == 2) + return true; + if (NumElts == 4 && VT.getSizeInBits() == 128) { + return (isMOVLMask(Mask, VT) || + isCommutedMOVLMask(Mask, VT, true) || + isSHUFPMask(Mask, VT) || + isCommutedSHUFPMask(Mask, VT)); + } + return false; +} + +//===----------------------------------------------------------------------===// +// X86 Scheduler Hooks +//===----------------------------------------------------------------------===// + +// private utility function +MachineBasicBlock * +X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr, + MachineBasicBlock *MBB, + unsigned regOpc, + unsigned immOpc, + unsigned LoadOpc, + unsigned CXchgOpc, + unsigned notOpc, + unsigned EAXreg, + TargetRegisterClass *RC, + bool invSrc) const { + // For the atomic bitwise operator, we generate + // thisMBB: + // newMBB: + // ld t1 = [bitinstr.addr] + // op t2 = t1, [bitinstr.val] + // mov EAX = t1 + // lcs dest = [bitinstr.addr], t2 [EAX is implicit] + // bz newMBB + // fallthrough -->nextMBB + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + const BasicBlock *LLVM_BB = MBB->getBasicBlock(); + MachineFunction::iterator MBBIter = MBB; + ++MBBIter; + + /// First build the CFG + MachineFunction *F = MBB->getParent(); + MachineBasicBlock *thisMBB = MBB; + MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); + F->insert(MBBIter, newMBB); + F->insert(MBBIter, nextMBB); + + // Transfer the remainder of thisMBB and its successor edges to nextMBB. + nextMBB->splice(nextMBB->begin(), thisMBB, + llvm::next(MachineBasicBlock::iterator(bInstr)), + thisMBB->end()); + nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); + + // Update thisMBB to fall through to newMBB + thisMBB->addSuccessor(newMBB); + + // newMBB jumps to itself and fall through to nextMBB + newMBB->addSuccessor(nextMBB); + newMBB->addSuccessor(newMBB); + + // Insert instructions into newMBB based on incoming instruction + assert(bInstr->getNumOperands() < X86::AddrNumOperands + 4 && + "unexpected number of operands"); + DebugLoc dl = bInstr->getDebugLoc(); + MachineOperand& destOper = bInstr->getOperand(0); + MachineOperand* argOpers[2 + X86::AddrNumOperands]; + int numArgs = bInstr->getNumOperands() - 1; + for (int i=0; i < numArgs; ++i) + argOpers[i] = &bInstr->getOperand(i+1); + + // x86 address has 4 operands: base, index, scale, and displacement + int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] + int valArgIndx = lastAddrIndx + 1; + + unsigned t1 = F->getRegInfo().createVirtualRegister(RC); + MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1); + for (int i=0; i <= lastAddrIndx; ++i) + (*MIB).addOperand(*argOpers[i]); + + unsigned tt = F->getRegInfo().createVirtualRegister(RC); + if (invSrc) { + MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1); + } + else + tt = t1; + + unsigned t2 = F->getRegInfo().createVirtualRegister(RC); + assert((argOpers[valArgIndx]->isReg() || + argOpers[valArgIndx]->isImm()) && + "invalid operand"); + if (argOpers[valArgIndx]->isReg()) + MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2); + else + MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2); + MIB.addReg(tt); + (*MIB).addOperand(*argOpers[valArgIndx]); + + MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), EAXreg); + MIB.addReg(t1); + + MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc)); + for (int i=0; i <= lastAddrIndx; ++i) + (*MIB).addOperand(*argOpers[i]); + MIB.addReg(t2); + assert(bInstr->hasOneMemOperand() && 
"Unexpected number of memoperand"); + (*MIB).setMemRefs(bInstr->memoperands_begin(), + bInstr->memoperands_end()); + + MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg()); + MIB.addReg(EAXreg); + + // insert branch + BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); + + bInstr->eraseFromParent(); // The pseudo instruction is gone now. + return nextMBB; +} + +// private utility function: 64 bit atomics on 32 bit host. +MachineBasicBlock * +X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, + MachineBasicBlock *MBB, + unsigned regOpcL, + unsigned regOpcH, + unsigned immOpcL, + unsigned immOpcH, + bool invSrc) const { + // For the atomic bitwise operator, we generate + // thisMBB (instructions are in pairs, except cmpxchg8b) + // ld t1,t2 = [bitinstr.addr] + // newMBB: + // out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4) + // op t5, t6 <- out1, out2, [bitinstr.val] + // (for SWAP, substitute: mov t5, t6 <- [bitinstr.val]) + // mov ECX, EBX <- t5, t6 + // mov EAX, EDX <- t1, t2 + // cmpxchg8b [bitinstr.addr] [EAX, EDX, EBX, ECX implicit] + // mov t3, t4 <- EAX, EDX + // bz newMBB + // result in out1, out2 + // fallthrough -->nextMBB + + const TargetRegisterClass *RC = X86::GR32RegisterClass; + const unsigned LoadOpc = X86::MOV32rm; + const unsigned NotOpc = X86::NOT32r; + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + const BasicBlock *LLVM_BB = MBB->getBasicBlock(); + MachineFunction::iterator MBBIter = MBB; + ++MBBIter; + + /// First build the CFG + MachineFunction *F = MBB->getParent(); + MachineBasicBlock *thisMBB = MBB; + MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); + F->insert(MBBIter, newMBB); + F->insert(MBBIter, nextMBB); + + // Transfer the remainder of thisMBB and its successor edges to nextMBB. + nextMBB->splice(nextMBB->begin(), thisMBB, + llvm::next(MachineBasicBlock::iterator(bInstr)), + thisMBB->end()); + nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); + + // Update thisMBB to fall through to newMBB + thisMBB->addSuccessor(newMBB); + + // newMBB jumps to itself and fall through to nextMBB + newMBB->addSuccessor(nextMBB); + newMBB->addSuccessor(newMBB); + + DebugLoc dl = bInstr->getDebugLoc(); + // Insert instructions into newMBB based on incoming instruction + // There are 8 "real" operands plus 9 implicit def/uses, ignored here. + assert(bInstr->getNumOperands() < X86::AddrNumOperands + 14 && + "unexpected number of operands"); + MachineOperand& dest1Oper = bInstr->getOperand(0); + MachineOperand& dest2Oper = bInstr->getOperand(1); + MachineOperand* argOpers[2 + X86::AddrNumOperands]; + for (int i=0; i < 2 + X86::AddrNumOperands; ++i) { + argOpers[i] = &bInstr->getOperand(i+2); + + // We use some of the operands multiple times, so conservatively just + // clear any kill flags that might be present. + if (argOpers[i]->isReg() && argOpers[i]->isUse()) + argOpers[i]->setIsKill(false); + } + + // x86 address has 5 operands: base, index, scale, displacement, and segment. + int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] + + unsigned t1 = F->getRegInfo().createVirtualRegister(RC); + MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1); + for (int i=0; i <= lastAddrIndx; ++i) + (*MIB).addOperand(*argOpers[i]); + unsigned t2 = F->getRegInfo().createVirtualRegister(RC); + MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2); + // add 4 to displacement. 
+ for (int i=0; i <= lastAddrIndx-2; ++i) + (*MIB).addOperand(*argOpers[i]); + MachineOperand newOp3 = *(argOpers[3]); + if (newOp3.isImm()) + newOp3.setImm(newOp3.getImm()+4); + else + newOp3.setOffset(newOp3.getOffset()+4); + (*MIB).addOperand(newOp3); + (*MIB).addOperand(*argOpers[lastAddrIndx]); + + // t3/4 are defined later, at the bottom of the loop + unsigned t3 = F->getRegInfo().createVirtualRegister(RC); + unsigned t4 = F->getRegInfo().createVirtualRegister(RC); + BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg()) + .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB); + BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg()) + .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB); + + // The subsequent operations should be using the destination registers of + //the PHI instructions. + if (invSrc) { + t1 = F->getRegInfo().createVirtualRegister(RC); + t2 = F->getRegInfo().createVirtualRegister(RC); + MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t1).addReg(dest1Oper.getReg()); + MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t2).addReg(dest2Oper.getReg()); + } else { + t1 = dest1Oper.getReg(); + t2 = dest2Oper.getReg(); + } + + int valArgIndx = lastAddrIndx + 1; + assert((argOpers[valArgIndx]->isReg() || + argOpers[valArgIndx]->isImm()) && + "invalid operand"); + unsigned t5 = F->getRegInfo().createVirtualRegister(RC); + unsigned t6 = F->getRegInfo().createVirtualRegister(RC); + if (argOpers[valArgIndx]->isReg()) + MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5); + else + MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5); + if (regOpcL != X86::MOV32rr) + MIB.addReg(t1); + (*MIB).addOperand(*argOpers[valArgIndx]); + assert(argOpers[valArgIndx + 1]->isReg() == + argOpers[valArgIndx]->isReg()); + assert(argOpers[valArgIndx + 1]->isImm() == + argOpers[valArgIndx]->isImm()); + if (argOpers[valArgIndx + 1]->isReg()) + MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6); + else + MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6); + if (regOpcH != X86::MOV32rr) + MIB.addReg(t2); + (*MIB).addOperand(*argOpers[valArgIndx + 1]); + + MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX); + MIB.addReg(t1); + MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EDX); + MIB.addReg(t2); + + MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EBX); + MIB.addReg(t5); + MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::ECX); + MIB.addReg(t6); + + MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B)); + for (int i=0; i <= lastAddrIndx; ++i) + (*MIB).addOperand(*argOpers[i]); + + assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); + (*MIB).setMemRefs(bInstr->memoperands_begin(), + bInstr->memoperands_end()); + + MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t3); + MIB.addReg(X86::EAX); + MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t4); + MIB.addReg(X86::EDX); + + // insert branch + BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); + + bInstr->eraseFromParent(); // The pseudo instruction is gone now. 
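An illustrative aside, not taken from this patch: both atomic expansions here (the 32-bit one above and this 64-bit pair version) are the standard compare-and-swap retry loop: load the old value, compute old OP val, attempt to CMPXCHG it back, and branch back to newMBB (the JNE) if another thread changed the location first. Roughly the same loop in C, using the GCC/Clang __sync builtin and AND as the sample operation:

#include <stdint.h>

static uint32_t atomic_fetch_and(volatile uint32_t *addr, uint32_t val) {
  uint32_t old, desired;
  do {
    old = *addr;                   // ld t1 = [bitinstr.addr]
    desired = old & val;           // op t2 = t1, [bitinstr.val]
  } while (!__sync_bool_compare_and_swap(addr, old, desired));  // lcmpxchg + JNE retry
  return old;                      // the original value, copied out of EAX
}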
+ return nextMBB; +} + +// private utility function +MachineBasicBlock * +X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, + MachineBasicBlock *MBB, + unsigned cmovOpc) const { + // For the atomic min/max operator, we generate + // thisMBB: + // newMBB: + // ld t1 = [min/max.addr] + // mov t2 = [min/max.val] + // cmp t1, t2 + // cmov[cond] t2 = t1 + // mov EAX = t1 + // lcs dest = [bitinstr.addr], t2 [EAX is implicit] + // bz newMBB + // fallthrough -->nextMBB + // + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + const BasicBlock *LLVM_BB = MBB->getBasicBlock(); + MachineFunction::iterator MBBIter = MBB; + ++MBBIter; + + /// First build the CFG + MachineFunction *F = MBB->getParent(); + MachineBasicBlock *thisMBB = MBB; + MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); + F->insert(MBBIter, newMBB); + F->insert(MBBIter, nextMBB); + + // Transfer the remainder of thisMBB and its successor edges to nextMBB. + nextMBB->splice(nextMBB->begin(), thisMBB, + llvm::next(MachineBasicBlock::iterator(mInstr)), + thisMBB->end()); + nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); + + // Update thisMBB to fall through to newMBB + thisMBB->addSuccessor(newMBB); + + // newMBB jumps to newMBB and fall through to nextMBB + newMBB->addSuccessor(nextMBB); + newMBB->addSuccessor(newMBB); + + DebugLoc dl = mInstr->getDebugLoc(); + // Insert instructions into newMBB based on incoming instruction + assert(mInstr->getNumOperands() < X86::AddrNumOperands + 4 && + "unexpected number of operands"); + MachineOperand& destOper = mInstr->getOperand(0); + MachineOperand* argOpers[2 + X86::AddrNumOperands]; + int numArgs = mInstr->getNumOperands() - 1; + for (int i=0; i < numArgs; ++i) + argOpers[i] = &mInstr->getOperand(i+1); + + // x86 address has 4 operands: base, index, scale, and displacement + int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] + int valArgIndx = lastAddrIndx + 1; + + unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); + MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1); + for (int i=0; i <= lastAddrIndx; ++i) + (*MIB).addOperand(*argOpers[i]); + + // We only support register and immediate values + assert((argOpers[valArgIndx]->isReg() || + argOpers[valArgIndx]->isImm()) && + "invalid operand"); + + unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); + if (argOpers[valArgIndx]->isReg()) + MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t2); + else + MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2); + (*MIB).addOperand(*argOpers[valArgIndx]); + + MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX); + MIB.addReg(t1); + + MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr)); + MIB.addReg(t1); + MIB.addReg(t2); + + // Generate movc + unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); + MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3); + MIB.addReg(t2); + MIB.addReg(t1); + + // Cmp and exchange if none has modified the memory location + MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32)); + for (int i=0; i <= lastAddrIndx; ++i) + (*MIB).addOperand(*argOpers[i]); + MIB.addReg(t3); + assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand"); + (*MIB).setMemRefs(mInstr->memoperands_begin(), + mInstr->memoperands_end()); + + MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg()); + MIB.addReg(X86::EAX); + + // insert 
branch + BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); + + mInstr->eraseFromParent(); // The pseudo instruction is gone now. + return nextMBB; +} + +// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8 +// or XMM0_V32I8 in AVX all of this code can be replaced with that +// in the .td file. +MachineBasicBlock * +X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB, + unsigned numArgs, bool memArg) const { + assert((Subtarget->hasSSE42() || Subtarget->hasAVX()) && + "Target must have SSE4.2 or AVX features enabled"); + + DebugLoc dl = MI->getDebugLoc(); + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + unsigned Opc; + if (!Subtarget->hasAVX()) { + if (memArg) + Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm; + else + Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr; + } else { + if (memArg) + Opc = numArgs == 3 ? X86::VPCMPISTRM128rm : X86::VPCMPESTRM128rm; + else + Opc = numArgs == 3 ? X86::VPCMPISTRM128rr : X86::VPCMPESTRM128rr; + } + + MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); + for (unsigned i = 0; i < numArgs; ++i) { + MachineOperand &Op = MI->getOperand(i+1); + if (!(Op.isReg() && Op.isImplicit())) + MIB.addOperand(Op); + } + BuildMI(*BB, MI, dl, + TII->get(Subtarget->hasAVX() ? X86::VMOVAPSrr : X86::MOVAPSrr), + MI->getOperand(0).getReg()) + .addReg(X86::XMM0); + + MI->eraseFromParent(); + return BB; +} + +MachineBasicBlock * +X86TargetLowering::EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB) const { + DebugLoc dl = MI->getDebugLoc(); + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + + // Address into RAX/EAX, other two args into ECX, EDX. + unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r; + unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; + MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg); + for (int i = 0; i < X86::AddrNumOperands; ++i) + MIB.addOperand(MI->getOperand(i)); + + unsigned ValOps = X86::AddrNumOperands; + BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX) + .addReg(MI->getOperand(ValOps).getReg()); + BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX) + .addReg(MI->getOperand(ValOps+1).getReg()); + + // The instruction doesn't actually take any operands though. + BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr)); + + MI->eraseFromParent(); // The pseudo is gone now. + return BB; +} + +MachineBasicBlock * +X86TargetLowering::EmitMwait(MachineInstr *MI, MachineBasicBlock *BB) const { + DebugLoc dl = MI->getDebugLoc(); + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + + // First arg in ECX, the second in EAX. + BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX) + .addReg(MI->getOperand(0).getReg()); + BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX) + .addReg(MI->getOperand(1).getReg()); + + // The instruction doesn't actually take any operands though. + BuildMI(*BB, MI, dl, TII->get(X86::MWAITrr)); + + MI->eraseFromParent(); // The pseudo is gone now. + return BB; +} + +MachineBasicBlock * +X86TargetLowering::EmitVAARG64WithCustomInserter( + MachineInstr *MI, + MachineBasicBlock *MBB) const { + // Emit va_arg instruction on X86-64. 
+ + // Operands to this pseudo-instruction: + // 0 ) Output : destination address (reg) + // 1-5) Input : va_list address (addr, i64mem) + // 6 ) ArgSize : Size (in bytes) of vararg type + // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset + // 8 ) Align : Alignment of type + // 9 ) EFLAGS (implicit-def) + + assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!"); + assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands"); + + unsigned DestReg = MI->getOperand(0).getReg(); + MachineOperand &Base = MI->getOperand(1); + MachineOperand &Scale = MI->getOperand(2); + MachineOperand &Index = MI->getOperand(3); + MachineOperand &Disp = MI->getOperand(4); + MachineOperand &Segment = MI->getOperand(5); + unsigned ArgSize = MI->getOperand(6).getImm(); + unsigned ArgMode = MI->getOperand(7).getImm(); + unsigned Align = MI->getOperand(8).getImm(); + + // Memory Reference + assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand"); + MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); + MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); + + // Machine Information + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64); + const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32); + DebugLoc DL = MI->getDebugLoc(); + + // struct va_list { + // i32 gp_offset + // i32 fp_offset + // i64 overflow_area (address) + // i64 reg_save_area (address) + // } + // sizeof(va_list) = 24 + // alignment(va_list) = 8 + + unsigned TotalNumIntRegs = 6; + unsigned TotalNumXMMRegs = 8; + bool UseGPOffset = (ArgMode == 1); + bool UseFPOffset = (ArgMode == 2); + unsigned MaxOffset = TotalNumIntRegs * 8 + + (UseFPOffset ? TotalNumXMMRegs * 16 : 0); + + /* Align ArgSize to a multiple of 8 */ + unsigned ArgSizeA8 = (ArgSize + 7) & ~7; + bool NeedsAlign = (Align > 8); + + MachineBasicBlock *thisMBB = MBB; + MachineBasicBlock *overflowMBB; + MachineBasicBlock *offsetMBB; + MachineBasicBlock *endMBB; + + unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB + unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB + unsigned OffsetReg = 0; + + if (!UseGPOffset && !UseFPOffset) { + // If we only pull from the overflow region, we don't create a branch. + // We don't need to alter control flow. + OffsetDestReg = 0; // unused + OverflowDestReg = DestReg; + + offsetMBB = NULL; + overflowMBB = thisMBB; + endMBB = thisMBB; + } else { + // First emit code to check if gp_offset (or fp_offset) is below the bound. + // If so, pull the argument from reg_save_area. (branch to offsetMBB) + // If not, pull from overflow_area. (branch to overflowMBB) + // + // thisMBB + // | . + // | . + // offsetMBB overflowMBB + // | . + // | . 
+ // endMBB + + // Registers for the PHI in endMBB + OffsetDestReg = MRI.createVirtualRegister(AddrRegClass); + OverflowDestReg = MRI.createVirtualRegister(AddrRegClass); + + const BasicBlock *LLVM_BB = MBB->getBasicBlock(); + MachineFunction *MF = MBB->getParent(); + overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB); + offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB); + endMBB = MF->CreateMachineBasicBlock(LLVM_BB); + + MachineFunction::iterator MBBIter = MBB; + ++MBBIter; + + // Insert the new basic blocks + MF->insert(MBBIter, offsetMBB); + MF->insert(MBBIter, overflowMBB); + MF->insert(MBBIter, endMBB); + + // Transfer the remainder of MBB and its successor edges to endMBB. + endMBB->splice(endMBB->begin(), thisMBB, + llvm::next(MachineBasicBlock::iterator(MI)), + thisMBB->end()); + endMBB->transferSuccessorsAndUpdatePHIs(thisMBB); + + // Make offsetMBB and overflowMBB successors of thisMBB + thisMBB->addSuccessor(offsetMBB); + thisMBB->addSuccessor(overflowMBB); + + // endMBB is a successor of both offsetMBB and overflowMBB + offsetMBB->addSuccessor(endMBB); + overflowMBB->addSuccessor(endMBB); + + // Load the offset value into a register + OffsetReg = MRI.createVirtualRegister(OffsetRegClass); + BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg) + .addOperand(Base) + .addOperand(Scale) + .addOperand(Index) + .addDisp(Disp, UseFPOffset ? 4 : 0) + .addOperand(Segment) + .setMemRefs(MMOBegin, MMOEnd); + + // Check if there is enough room left to pull this argument. + BuildMI(thisMBB, DL, TII->get(X86::CMP32ri)) + .addReg(OffsetReg) + .addImm(MaxOffset + 8 - ArgSizeA8); + + // Branch to "overflowMBB" if offset >= max + // Fall through to "offsetMBB" otherwise + BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE))) + .addMBB(overflowMBB); + } + + // In offsetMBB, emit code to use the reg_save_area. + if (offsetMBB) { + assert(OffsetReg != 0); + + // Read the reg_save_area address. + unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass); + BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg) + .addOperand(Base) + .addOperand(Scale) + .addOperand(Index) + .addDisp(Disp, 16) + .addOperand(Segment) + .setMemRefs(MMOBegin, MMOEnd); + + // Zero-extend the offset + unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass); + BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64) + .addImm(0) + .addReg(OffsetReg) + .addImm(X86::sub_32bit); + + // Add the offset to the reg_save_area to get the final address. + BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg) + .addReg(OffsetReg64) + .addReg(RegSaveReg); + + // Compute the offset for the next argument + unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass); + BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg) + .addReg(OffsetReg) + .addImm(UseFPOffset ? 16 : 8); + + // Store it back into the va_list. + BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr)) + .addOperand(Base) + .addOperand(Scale) + .addOperand(Index) + .addDisp(Disp, UseFPOffset ? 4 : 0) + .addOperand(Segment) + .addReg(NextOffsetReg) + .setMemRefs(MMOBegin, MMOEnd); + + // Jump to endMBB + BuildMI(offsetMBB, DL, TII->get(X86::JMP_4)) + .addMBB(endMBB); + } + + // + // Emit code to use overflow area + // + + // Load the overflow_area address into a register. 
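+ // (The va_list field offsets used here match the struct layout above:
+ //  gp_offset at byte 0, fp_offset at 4, overflow_area at 8 and
+ //  reg_save_area at 16, hence the .addDisp(Disp, 8) just below.)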
+ unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass); + BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg) + .addOperand(Base) + .addOperand(Scale) + .addOperand(Index) + .addDisp(Disp, 8) + .addOperand(Segment) + .setMemRefs(MMOBegin, MMOEnd); + + // If we need to align it, do so. Otherwise, just copy the address + // to OverflowDestReg. + if (NeedsAlign) { + // Align the overflow address + assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2"); + unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass); + + // aligned_addr = (addr + (align-1)) & ~(align-1) + BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg) + .addReg(OverflowAddrReg) + .addImm(Align-1); + + BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg) + .addReg(TmpReg) + .addImm(~(uint64_t)(Align-1)); + } else { + BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg) + .addReg(OverflowAddrReg); + } + + // Compute the next overflow address after this argument. + // (the overflow address should be kept 8-byte aligned) + unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass); + BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg) + .addReg(OverflowDestReg) + .addImm(ArgSizeA8); + + // Store the new overflow address. + BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr)) + .addOperand(Base) + .addOperand(Scale) + .addOperand(Index) + .addDisp(Disp, 8) + .addOperand(Segment) + .addReg(NextAddrReg) + .setMemRefs(MMOBegin, MMOEnd); + + // If we branched, emit the PHI to the front of endMBB. + if (offsetMBB) { + BuildMI(*endMBB, endMBB->begin(), DL, + TII->get(X86::PHI), DestReg) + .addReg(OffsetDestReg).addMBB(offsetMBB) + .addReg(OverflowDestReg).addMBB(overflowMBB); + } + + // Erase the pseudo instruction + MI->eraseFromParent(); + + return endMBB; +} + +MachineBasicBlock * +X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( + MachineInstr *MI, + MachineBasicBlock *MBB) const { + // Emit code to save XMM registers to the stack. The ABI says that the + // number of registers to save is given in %al, so it's theoretically + // possible to do an indirect jump trick to avoid saving all of them, + // however this code takes a simpler approach and just executes all + // of the stores if %al is non-zero. It's less code, and it's probably + // easier on the hardware branch predictor, and stores aren't all that + // expensive anyway. + + // Create the new basic blocks. One block contains all the XMM stores, + // and one block is the final destination regardless of whether any + // stores were performed. + const BasicBlock *LLVM_BB = MBB->getBasicBlock(); + MachineFunction *F = MBB->getParent(); + MachineFunction::iterator MBBIter = MBB; + ++MBBIter; + MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB); + F->insert(MBBIter, XMMSaveMBB); + F->insert(MBBIter, EndMBB); + + // Transfer the remainder of MBB and its successor edges to EndMBB. + EndMBB->splice(EndMBB->begin(), MBB, + llvm::next(MachineBasicBlock::iterator(MI)), + MBB->end()); + EndMBB->transferSuccessorsAndUpdatePHIs(MBB); + + // The original block will now fall through to the XMM save block. + MBB->addSuccessor(XMMSaveMBB); + // The XMMSaveMBB will fall through to the end block. + XMMSaveMBB->addSuccessor(EndMBB); + + // Now add the instructions. 
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + DebugLoc DL = MI->getDebugLoc(); + + unsigned CountReg = MI->getOperand(0).getReg(); + int64_t RegSaveFrameIndex = MI->getOperand(1).getImm(); + int64_t VarArgsFPOffset = MI->getOperand(2).getImm(); + + if (!Subtarget->isTargetWin64()) { + // If %al is 0, branch around the XMM save block. + BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); + BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB); + MBB->addSuccessor(EndMBB); + } + + unsigned MOVOpc = Subtarget->hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr; + // In the XMM save block, save all the XMM argument registers. + for (int i = 3, e = MI->getNumOperands(); i != e; ++i) { + int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; + MachineMemOperand *MMO = + F->getMachineMemOperand( + MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset), + MachineMemOperand::MOStore, + /*Size=*/16, /*Align=*/16); + BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc)) + .addFrameIndex(RegSaveFrameIndex) + .addImm(/*Scale=*/1) + .addReg(/*IndexReg=*/0) + .addImm(/*Disp=*/Offset) + .addReg(/*Segment=*/0) + .addReg(MI->getOperand(i).getReg()) + .addMemOperand(MMO); + } + + MI->eraseFromParent(); // The pseudo instruction is gone now. + + return EndMBB; +} + +MachineBasicBlock * +X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, + MachineBasicBlock *BB) const { + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + DebugLoc DL = MI->getDebugLoc(); + + // To "insert" a SELECT_CC instruction, we actually have to insert the + // diamond control-flow pattern. The incoming instruction knows the + // destination vreg to set, the condition code register to branch on, the + // true/false values to select between, and a branch opcode to use. + const BasicBlock *LLVM_BB = BB->getBasicBlock(); + MachineFunction::iterator It = BB; + ++It; + + // thisMBB: + // ... + // TrueVal = ... + // cmpTY ccX, r1, r2 + // bCC copy1MBB + // fallthrough --> copy0MBB + MachineBasicBlock *thisMBB = BB; + MachineFunction *F = BB->getParent(); + MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); + F->insert(It, copy0MBB); + F->insert(It, sinkMBB); + + // If the EFLAGS register isn't dead in the terminator, then claim that it's + // live into the sink and copy blocks. + if (!MI->killsRegister(X86::EFLAGS)) { + copy0MBB->addLiveIn(X86::EFLAGS); + sinkMBB->addLiveIn(X86::EFLAGS); + } + + // Transfer the remainder of BB and its successor edges to sinkMBB. + sinkMBB->splice(sinkMBB->begin(), BB, + llvm::next(MachineBasicBlock::iterator(MI)), + BB->end()); + sinkMBB->transferSuccessorsAndUpdatePHIs(BB); + + // Add the true and fallthrough blocks as its successors. + BB->addSuccessor(copy0MBB); + BB->addSuccessor(sinkMBB); + + // Create the conditional branch instruction. + unsigned Opc = + X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); + BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); + + // copy0MBB: + // %FalseValue = ... + // # fallthrough to sinkMBB + copy0MBB->addSuccessor(sinkMBB); + + // sinkMBB: + // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] + // ... + BuildMI(*sinkMBB, sinkMBB->begin(), DL, + TII->get(X86::PHI), MI->getOperand(0).getReg()) + .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) + .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); + + MI->eraseFromParent(); // The pseudo instruction is gone now. 
+ return sinkMBB; +} + +MachineBasicBlock * +X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, + bool Is64Bit) const { + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + DebugLoc DL = MI->getDebugLoc(); + MachineFunction *MF = BB->getParent(); + const BasicBlock *LLVM_BB = BB->getBasicBlock(); + + assert(EnableSegmentedStacks); + + unsigned TlsReg = Is64Bit ? X86::FS : X86::GS; + unsigned TlsOffset = Is64Bit ? 0x70 : 0x30; + + // BB: + // ... [Till the alloca] + // If stacklet is not large enough, jump to mallocMBB + // + // bumpMBB: + // Allocate by subtracting from RSP + // Jump to continueMBB + // + // mallocMBB: + // Allocate by call to runtime + // + // continueMBB: + // ... + // [rest of original BB] + // + + MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB); + + MachineRegisterInfo &MRI = MF->getRegInfo(); + const TargetRegisterClass *AddrRegClass = + getRegClassFor(Is64Bit ? MVT::i64:MVT::i32); + + unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass), + bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass), + tmpSPVReg = MRI.createVirtualRegister(AddrRegClass), + sizeVReg = MI->getOperand(1).getReg(), + physSPReg = Is64Bit ? X86::RSP : X86::ESP; + + MachineFunction::iterator MBBIter = BB; + ++MBBIter; + + MF->insert(MBBIter, bumpMBB); + MF->insert(MBBIter, mallocMBB); + MF->insert(MBBIter, continueMBB); + + continueMBB->splice(continueMBB->begin(), BB, llvm::next + (MachineBasicBlock::iterator(MI)), BB->end()); + continueMBB->transferSuccessorsAndUpdatePHIs(BB); + + // Add code to the main basic block to check if the stack limit has been hit, + // and if so, jump to mallocMBB otherwise to bumpMBB. + BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg); + BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), tmpSPVReg) + .addReg(tmpSPVReg).addReg(sizeVReg); + BuildMI(BB, DL, TII->get(Is64Bit ? X86::CMP64mr:X86::CMP32mr)) + .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg) + .addReg(tmpSPVReg); + BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB); + + // bumpMBB simply decreases the stack pointer, since we know the current + // stacklet has enough space. + BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg) + .addReg(tmpSPVReg); + BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg) + .addReg(tmpSPVReg); + BuildMI(bumpMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB); + + // Calls into a routine in libgcc to allocate more space from the heap. + if (Is64Bit) { + BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI) + .addReg(sizeVReg); + BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) + .addExternalSymbol("__morestack_allocate_stack_space").addReg(X86::RDI); + } else { + BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg) + .addImm(12); + BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg); + BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32)) + .addExternalSymbol("__morestack_allocate_stack_space"); + } + + if (!Is64Bit) + BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg) + .addImm(16); + + BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg) + .addReg(Is64Bit ? X86::RAX : X86::EAX); + BuildMI(mallocMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB); + + // Set up the CFG correctly. 
+ BB->addSuccessor(bumpMBB); + BB->addSuccessor(mallocMBB); + mallocMBB->addSuccessor(continueMBB); + bumpMBB->addSuccessor(continueMBB); + + // Take care of the PHI nodes. + BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI), + MI->getOperand(0).getReg()) + .addReg(mallocPtrVReg).addMBB(mallocMBB) + .addReg(bumpSPPtrVReg).addMBB(bumpMBB); + + // Delete the original pseudo instruction. + MI->eraseFromParent(); + + // And we're done. + return continueMBB; +} + +MachineBasicBlock * +X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, + MachineBasicBlock *BB) const { + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + DebugLoc DL = MI->getDebugLoc(); + + assert(!Subtarget->isTargetEnvMacho()); + + // The lowering is pretty easy: we're just emitting the call to _alloca. The + // non-trivial part is impdef of ESP. + + if (Subtarget->isTargetWin64()) { + if (Subtarget->isTargetCygMing()) { + // ___chkstk(Mingw64): + // Clobbers R10, R11, RAX and EFLAGS. + // Updates RSP. + BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA)) + .addExternalSymbol("___chkstk") + .addReg(X86::RAX, RegState::Implicit) + .addReg(X86::RSP, RegState::Implicit) + .addReg(X86::RAX, RegState::Define | RegState::Implicit) + .addReg(X86::RSP, RegState::Define | RegState::Implicit) + .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); + } else { + // __chkstk(MSVCRT): does not update stack pointer. + // Clobbers R10, R11 and EFLAGS. + // FIXME: RAX(allocated size) might be reused and not killed. + BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA)) + .addExternalSymbol("__chkstk") + .addReg(X86::RAX, RegState::Implicit) + .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); + // RAX has the offset to subtracted from RSP. + BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP) + .addReg(X86::RSP) + .addReg(X86::RAX); + } + } else { + const char *StackProbeSymbol = + Subtarget->isTargetWindows() ? "_chkstk" : "_alloca"; + + BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32)) + .addExternalSymbol(StackProbeSymbol) + .addReg(X86::EAX, RegState::Implicit) + .addReg(X86::ESP, RegState::Implicit) + .addReg(X86::EAX, RegState::Define | RegState::Implicit) + .addReg(X86::ESP, RegState::Define | RegState::Implicit) + .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); + } + + MI->eraseFromParent(); // The pseudo instruction is gone now. + return BB; +} + +MachineBasicBlock * +X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, + MachineBasicBlock *BB) const { + // This is pretty easy. We're taking the value that we received from + // our load from the relocation, sticking it in either RDI (x86-64) + // or EAX and doing an indirect call. The return value will then + // be in the normal return register. 
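+ // (On x86-64 Darwin the sequence built below is roughly
+ //    movq _var@TLVP(%rip), %rdi
+ //    callq *(%rdi)
+ //  i.e. load the TLV descriptor address and call through its first entry;
+ //  the 32-bit paths do the same through %eax.)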
+ const X86InstrInfo *TII + = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo()); + DebugLoc DL = MI->getDebugLoc(); + MachineFunction *F = BB->getParent(); + + assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?"); + assert(MI->getOperand(3).isGlobal() && "This should be a global"); + + if (Subtarget->is64Bit()) { + MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, + TII->get(X86::MOV64rm), X86::RDI) + .addReg(X86::RIP) + .addImm(0).addReg(0) + .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, + MI->getOperand(3).getTargetFlags()) + .addReg(0); + MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m)); + addDirectMem(MIB, X86::RDI); + } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) { + MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, + TII->get(X86::MOV32rm), X86::EAX) + .addReg(0) + .addImm(0).addReg(0) + .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, + MI->getOperand(3).getTargetFlags()) + .addReg(0); + MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); + addDirectMem(MIB, X86::EAX); + } else { + MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, + TII->get(X86::MOV32rm), X86::EAX) + .addReg(TII->getGlobalBaseReg(F)) + .addImm(0).addReg(0) + .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, + MI->getOperand(3).getTargetFlags()) + .addReg(0); + MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); + addDirectMem(MIB, X86::EAX); + } + + MI->eraseFromParent(); // The pseudo instruction is gone now. + return BB; +} + +MachineBasicBlock * +X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, + MachineBasicBlock *BB) const { + switch (MI->getOpcode()) { + default: assert(0 && "Unexpected instr type to insert"); + case X86::TAILJMPd64: + case X86::TAILJMPr64: + case X86::TAILJMPm64: + assert(0 && "TAILJMP64 would not be touched here."); + case X86::TCRETURNdi64: + case X86::TCRETURNri64: + case X86::TCRETURNmi64: + // Defs of TCRETURNxx64 has Win64's callee-saved registers, as subset. + // On AMD64, additional defs should be added before register allocation. 
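+ // (RSI, RDI and XMM6-XMM15 are callee-saved under the Win64 convention but
+ //  not under the SysV AMD64 ABI, which is why they are added as defs below
+ //  only for non-Win64 targets.)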
+ if (!Subtarget->isTargetWin64()) { + MI->addRegisterDefined(X86::RSI); + MI->addRegisterDefined(X86::RDI); + MI->addRegisterDefined(X86::XMM6); + MI->addRegisterDefined(X86::XMM7); + MI->addRegisterDefined(X86::XMM8); + MI->addRegisterDefined(X86::XMM9); + MI->addRegisterDefined(X86::XMM10); + MI->addRegisterDefined(X86::XMM11); + MI->addRegisterDefined(X86::XMM12); + MI->addRegisterDefined(X86::XMM13); + MI->addRegisterDefined(X86::XMM14); + MI->addRegisterDefined(X86::XMM15); + } + return BB; + case X86::WIN_ALLOCA: + return EmitLoweredWinAlloca(MI, BB); + case X86::SEG_ALLOCA_32: + return EmitLoweredSegAlloca(MI, BB, false); + case X86::SEG_ALLOCA_64: + return EmitLoweredSegAlloca(MI, BB, true); + case X86::TLSCall_32: + case X86::TLSCall_64: + return EmitLoweredTLSCall(MI, BB); + case X86::CMOV_GR8: + case X86::CMOV_FR32: + case X86::CMOV_FR64: + case X86::CMOV_V4F32: + case X86::CMOV_V2F64: + case X86::CMOV_V2I64: + case X86::CMOV_V8F32: + case X86::CMOV_V4F64: + case X86::CMOV_V4I64: + case X86::CMOV_GR16: + case X86::CMOV_GR32: + case X86::CMOV_RFP32: + case X86::CMOV_RFP64: + case X86::CMOV_RFP80: + return EmitLoweredSelect(MI, BB); + + case X86::FP32_TO_INT16_IN_MEM: + case X86::FP32_TO_INT32_IN_MEM: + case X86::FP32_TO_INT64_IN_MEM: + case X86::FP64_TO_INT16_IN_MEM: + case X86::FP64_TO_INT32_IN_MEM: + case X86::FP64_TO_INT64_IN_MEM: + case X86::FP80_TO_INT16_IN_MEM: + case X86::FP80_TO_INT32_IN_MEM: + case X86::FP80_TO_INT64_IN_MEM: { + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + DebugLoc DL = MI->getDebugLoc(); + + // Change the floating point control register to use "round towards zero" + // mode when truncating to an integer value. + MachineFunction *F = BB->getParent(); + int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false); + addFrameReference(BuildMI(*BB, MI, DL, + TII->get(X86::FNSTCW16m)), CWFrameIdx); + + // Load the old value of the high byte of the control word... + unsigned OldCW = + F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass); + addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW), + CWFrameIdx); + + // Set the high part to be round to zero... + addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx) + .addImm(0xC7F); + + // Reload the modified control word now... + addFrameReference(BuildMI(*BB, MI, DL, + TII->get(X86::FLDCW16m)), CWFrameIdx); + + // Restore the memory image of control word to original value + addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx) + .addReg(OldCW); + + // Get the X86 opcode to use. 
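+ // (In the IST_Fp<n>m<m> opcodes chosen below, <n> is the integer width
+ //  stored to memory and m<m> the width of the x87 source value; e.g.
+ //  IST_Fp16m32 stores an f32 as a 16-bit integer.)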
+ unsigned Opc; + switch (MI->getOpcode()) { + default: llvm_unreachable("illegal opcode!"); + case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; + case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; + case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; + case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; + case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; + case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; + case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; + case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; + case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; + } + + X86AddressMode AM; + MachineOperand &Op = MI->getOperand(0); + if (Op.isReg()) { + AM.BaseType = X86AddressMode::RegBase; + AM.Base.Reg = Op.getReg(); + } else { + AM.BaseType = X86AddressMode::FrameIndexBase; + AM.Base.FrameIndex = Op.getIndex(); + } + Op = MI->getOperand(1); + if (Op.isImm()) + AM.Scale = Op.getImm(); + Op = MI->getOperand(2); + if (Op.isImm()) + AM.IndexReg = Op.getImm(); + Op = MI->getOperand(3); + if (Op.isGlobal()) { + AM.GV = Op.getGlobal(); + } else { + AM.Disp = Op.getImm(); + } + addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM) + .addReg(MI->getOperand(X86::AddrNumOperands).getReg()); + + // Reload the original control word now. + addFrameReference(BuildMI(*BB, MI, DL, + TII->get(X86::FLDCW16m)), CWFrameIdx); + + MI->eraseFromParent(); // The pseudo instruction is gone now. + return BB; + } + // String/text processing lowering. + case X86::PCMPISTRM128REG: + case X86::VPCMPISTRM128REG: + return EmitPCMP(MI, BB, 3, false /* in-mem */); + case X86::PCMPISTRM128MEM: + case X86::VPCMPISTRM128MEM: + return EmitPCMP(MI, BB, 3, true /* in-mem */); + case X86::PCMPESTRM128REG: + case X86::VPCMPESTRM128REG: + return EmitPCMP(MI, BB, 5, false /* in mem */); + case X86::PCMPESTRM128MEM: + case X86::VPCMPESTRM128MEM: + return EmitPCMP(MI, BB, 5, true /* in mem */); + + // Thread synchronization. + case X86::MONITOR: + return EmitMonitor(MI, BB); + case X86::MWAIT: + return EmitMwait(MI, BB); + + // Atomic Lowering. 
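+ // (Each ATOM* pseudo below is expanded by the EmitAtomic* helpers into a
+ //  load / operate / LCMPXCHG loop that branches back on failure; the NAND
+ //  variants additionally apply the NOT opcode before the compare-exchange.)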
+ case X86::ATOMAND32: + return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, + X86::AND32ri, X86::MOV32rm, + X86::LCMPXCHG32, + X86::NOT32r, X86::EAX, + X86::GR32RegisterClass); + case X86::ATOMOR32: + return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr, + X86::OR32ri, X86::MOV32rm, + X86::LCMPXCHG32, + X86::NOT32r, X86::EAX, + X86::GR32RegisterClass); + case X86::ATOMXOR32: + return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr, + X86::XOR32ri, X86::MOV32rm, + X86::LCMPXCHG32, + X86::NOT32r, X86::EAX, + X86::GR32RegisterClass); + case X86::ATOMNAND32: + return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, + X86::AND32ri, X86::MOV32rm, + X86::LCMPXCHG32, + X86::NOT32r, X86::EAX, + X86::GR32RegisterClass, true); + case X86::ATOMMIN32: + return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr); + case X86::ATOMMAX32: + return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr); + case X86::ATOMUMIN32: + return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr); + case X86::ATOMUMAX32: + return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr); + + case X86::ATOMAND16: + return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, + X86::AND16ri, X86::MOV16rm, + X86::LCMPXCHG16, + X86::NOT16r, X86::AX, + X86::GR16RegisterClass); + case X86::ATOMOR16: + return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr, + X86::OR16ri, X86::MOV16rm, + X86::LCMPXCHG16, + X86::NOT16r, X86::AX, + X86::GR16RegisterClass); + case X86::ATOMXOR16: + return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr, + X86::XOR16ri, X86::MOV16rm, + X86::LCMPXCHG16, + X86::NOT16r, X86::AX, + X86::GR16RegisterClass); + case X86::ATOMNAND16: + return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, + X86::AND16ri, X86::MOV16rm, + X86::LCMPXCHG16, + X86::NOT16r, X86::AX, + X86::GR16RegisterClass, true); + case X86::ATOMMIN16: + return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr); + case X86::ATOMMAX16: + return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr); + case X86::ATOMUMIN16: + return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr); + case X86::ATOMUMAX16: + return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr); + + case X86::ATOMAND8: + return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, + X86::AND8ri, X86::MOV8rm, + X86::LCMPXCHG8, + X86::NOT8r, X86::AL, + X86::GR8RegisterClass); + case X86::ATOMOR8: + return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr, + X86::OR8ri, X86::MOV8rm, + X86::LCMPXCHG8, + X86::NOT8r, X86::AL, + X86::GR8RegisterClass); + case X86::ATOMXOR8: + return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr, + X86::XOR8ri, X86::MOV8rm, + X86::LCMPXCHG8, + X86::NOT8r, X86::AL, + X86::GR8RegisterClass); + case X86::ATOMNAND8: + return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, + X86::AND8ri, X86::MOV8rm, + X86::LCMPXCHG8, + X86::NOT8r, X86::AL, + X86::GR8RegisterClass, true); + // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way. + // This group is for 64-bit host. 
+ case X86::ATOMAND64: + return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, + X86::AND64ri32, X86::MOV64rm, + X86::LCMPXCHG64, + X86::NOT64r, X86::RAX, + X86::GR64RegisterClass); + case X86::ATOMOR64: + return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr, + X86::OR64ri32, X86::MOV64rm, + X86::LCMPXCHG64, + X86::NOT64r, X86::RAX, + X86::GR64RegisterClass); + case X86::ATOMXOR64: + return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr, + X86::XOR64ri32, X86::MOV64rm, + X86::LCMPXCHG64, + X86::NOT64r, X86::RAX, + X86::GR64RegisterClass); + case X86::ATOMNAND64: + return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, + X86::AND64ri32, X86::MOV64rm, + X86::LCMPXCHG64, + X86::NOT64r, X86::RAX, + X86::GR64RegisterClass, true); + case X86::ATOMMIN64: + return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr); + case X86::ATOMMAX64: + return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr); + case X86::ATOMUMIN64: + return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr); + case X86::ATOMUMAX64: + return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr); + + // This group does 64-bit operations on a 32-bit host. + case X86::ATOMAND6432: + return EmitAtomicBit6432WithCustomInserter(MI, BB, + X86::AND32rr, X86::AND32rr, + X86::AND32ri, X86::AND32ri, + false); + case X86::ATOMOR6432: + return EmitAtomicBit6432WithCustomInserter(MI, BB, + X86::OR32rr, X86::OR32rr, + X86::OR32ri, X86::OR32ri, + false); + case X86::ATOMXOR6432: + return EmitAtomicBit6432WithCustomInserter(MI, BB, + X86::XOR32rr, X86::XOR32rr, + X86::XOR32ri, X86::XOR32ri, + false); + case X86::ATOMNAND6432: + return EmitAtomicBit6432WithCustomInserter(MI, BB, + X86::AND32rr, X86::AND32rr, + X86::AND32ri, X86::AND32ri, + true); + case X86::ATOMADD6432: + return EmitAtomicBit6432WithCustomInserter(MI, BB, + X86::ADD32rr, X86::ADC32rr, + X86::ADD32ri, X86::ADC32ri, + false); + case X86::ATOMSUB6432: + return EmitAtomicBit6432WithCustomInserter(MI, BB, + X86::SUB32rr, X86::SBB32rr, + X86::SUB32ri, X86::SBB32ri, + false); + case X86::ATOMSWAP6432: + return EmitAtomicBit6432WithCustomInserter(MI, BB, + X86::MOV32rr, X86::MOV32rr, + X86::MOV32ri, X86::MOV32ri, + false); + case X86::VASTART_SAVE_XMM_REGS: + return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); + + case X86::VAARG_64: + return EmitVAARG64WithCustomInserter(MI, BB); + } +} + +//===----------------------------------------------------------------------===// +// X86 Optimization Hooks +//===----------------------------------------------------------------------===// + +void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, + const APInt &Mask, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth) const { + unsigned Opc = Op.getOpcode(); + assert((Opc >= ISD::BUILTIN_OP_END || + Opc == ISD::INTRINSIC_WO_CHAIN || + Opc == ISD::INTRINSIC_W_CHAIN || + Opc == ISD::INTRINSIC_VOID) && + "Should use MaskedValueIsZero if you don't know whether Op" + " is a target node!"); + + KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); // Don't know anything. + switch (Opc) { + default: break; + case X86ISD::ADD: + case X86ISD::SUB: + case X86ISD::ADC: + case X86ISD::SBB: + case X86ISD::SMUL: + case X86ISD::UMUL: + case X86ISD::INC: + case X86ISD::DEC: + case X86ISD::OR: + case X86ISD::XOR: + case X86ISD::AND: + // These nodes' second result is a boolean. 
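+ // (That is, only bit 0 of result 1 can ever be set, so all higher bits are
+ //  reported as known zero below.)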
+ if (Op.getResNo() == 0) + break; + // Fallthrough + case X86ISD::SETCC: + KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(), + Mask.getBitWidth() - 1); + break; + case ISD::INTRINSIC_WO_CHAIN: { + unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + unsigned NumLoBits = 0; + switch (IntId) { + default: break; + case Intrinsic::x86_sse_movmsk_ps: + case Intrinsic::x86_avx_movmsk_ps_256: + case Intrinsic::x86_sse2_movmsk_pd: + case Intrinsic::x86_avx_movmsk_pd_256: + case Intrinsic::x86_mmx_pmovmskb: + case Intrinsic::x86_sse2_pmovmskb_128: { + // High bits of movmskp{s|d}, pmovmskb are known zero. + switch (IntId) { + case Intrinsic::x86_sse_movmsk_ps: NumLoBits = 4; break; + case Intrinsic::x86_avx_movmsk_ps_256: NumLoBits = 8; break; + case Intrinsic::x86_sse2_movmsk_pd: NumLoBits = 2; break; + case Intrinsic::x86_avx_movmsk_pd_256: NumLoBits = 4; break; + case Intrinsic::x86_mmx_pmovmskb: NumLoBits = 8; break; + case Intrinsic::x86_sse2_pmovmskb_128: NumLoBits = 16; break; + } + KnownZero = APInt::getHighBitsSet(Mask.getBitWidth(), + Mask.getBitWidth() - NumLoBits); + break; + } + } + break; + } + } +} + +unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op, + unsigned Depth) const { + // SETCC_CARRY sets the dest to ~0 for true or 0 for false. + if (Op.getOpcode() == X86ISD::SETCC_CARRY) + return Op.getValueType().getScalarType().getSizeInBits(); + + // Fallback case. + return 1; +} + +/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the +/// node is a GlobalAddress + offset. +bool X86TargetLowering::isGAPlusOffset(SDNode *N, + const GlobalValue* &GA, + int64_t &Offset) const { + if (N->getOpcode() == X86ISD::Wrapper) { + if (isa<GlobalAddressSDNode>(N->getOperand(0))) { + GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal(); + Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset(); + return true; + } + } + return TargetLowering::isGAPlusOffset(N, GA, Offset); +} + +/// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the +/// same as extracting the high 128-bit part of 256-bit vector and then +/// inserting the result into the low part of a new 256-bit vector +static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) { + EVT VT = SVOp->getValueType(0); + int NumElems = VT.getVectorNumElements(); + + // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> + for (int i = 0, j = NumElems/2; i < NumElems/2; ++i, ++j) + if (!isUndefOrEqual(SVOp->getMaskElt(i), j) || + SVOp->getMaskElt(j) >= 0) + return false; + + return true; +} + +/// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the +/// same as extracting the low 128-bit part of 256-bit vector and then +/// inserting the result into the high part of a new 256-bit vector +static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) { + EVT VT = SVOp->getValueType(0); + int NumElems = VT.getVectorNumElements(); + + // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1> + for (int i = NumElems/2, j = 0; i < NumElems; ++i, ++j) + if (!isUndefOrEqual(SVOp->getMaskElt(i), j) || + SVOp->getMaskElt(j) >= 0) + return false; + + return true; +} + +/// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors. 
+static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + DebugLoc dl = N->getDebugLoc(); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); + SDValue V1 = SVOp->getOperand(0); + SDValue V2 = SVOp->getOperand(1); + EVT VT = SVOp->getValueType(0); + int NumElems = VT.getVectorNumElements(); + + if (V1.getOpcode() == ISD::CONCAT_VECTORS && + V2.getOpcode() == ISD::CONCAT_VECTORS) { + // + // 0,0,0,... + // | + // V UNDEF BUILD_VECTOR UNDEF + // \ / \ / + // CONCAT_VECTOR CONCAT_VECTOR + // \ / + // \ / + // RESULT: V + zero extended + // + if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR || + V2.getOperand(1).getOpcode() != ISD::UNDEF || + V1.getOperand(1).getOpcode() != ISD::UNDEF) + return SDValue(); + + if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode())) + return SDValue(); + + // To match the shuffle mask, the first half of the mask should + // be exactly the first vector, and all the rest a splat with the + // first element of the second one. + for (int i = 0; i < NumElems/2; ++i) + if (!isUndefOrEqual(SVOp->getMaskElt(i), i) || + !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems)) + return SDValue(); + + // Emit a zeroed vector and insert the desired subvector on its + // first half. + SDValue Zeros = getZeroVector(VT, true /* HasXMMInt */, DAG, dl); + SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), + DAG.getConstant(0, MVT::i32), DAG, dl); + return DCI.CombineTo(N, InsV); + } + + //===--------------------------------------------------------------------===// + // Combine some shuffles into subvector extracts and inserts: + // + + // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> + if (isShuffleHigh128VectorInsertLow(SVOp)) { + SDValue V = Extract128BitVector(V1, DAG.getConstant(NumElems/2, MVT::i32), + DAG, dl); + SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), + V, DAG.getConstant(0, MVT::i32), DAG, dl); + return DCI.CombineTo(N, InsV); + } + + // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1> + if (isShuffleLow128VectorInsertHigh(SVOp)) { + SDValue V = Extract128BitVector(V1, DAG.getConstant(0, MVT::i32), DAG, dl); + SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), + V, DAG.getConstant(NumElems/2, MVT::i32), DAG, dl); + return DCI.CombineTo(N, InsV); + } + + return SDValue(); +} + +/// PerformShuffleCombine - Performs several different shuffle combines. +static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + DebugLoc dl = N->getDebugLoc(); + EVT VT = N->getValueType(0); + + // Don't create instructions with illegal types after legalize types has run. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType())) + return SDValue(); + + // Combine 256-bit vector shuffles. This is only profitable when in AVX mode + if (Subtarget->hasAVX() && VT.getSizeInBits() == 256 && + N->getOpcode() == ISD::VECTOR_SHUFFLE) + return PerformShuffleCombine256(N, DAG, DCI); + + // Only handle 128 wide vector from here on. + if (VT.getSizeInBits() != 128) + return SDValue(); + + // Combine a vector_shuffle that is equal to build_vector load1, load2, load3, + // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are + // consecutive, non-overlapping, and in the right order. 
+ SmallVector<SDValue, 16> Elts;
+ for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
+ Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
+
+ return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
+}
+
+/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
+/// generation and convert it from being a bunch of shuffles and extracts
+/// to a simple store and scalar loads to extract the elements.
+static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
+ const TargetLowering &TLI) {
+ SDValue InputVector = N->getOperand(0);
+
+ // Only operate on vectors of 4 elements, where the alternative shuffling
+ // gets to be more expensive.
+ if (InputVector.getValueType() != MVT::v4i32)
+ return SDValue();
+
+ // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
+ // single use which is a sign-extend or zero-extend, and all elements are
+ // used.
+ SmallVector<SDNode *, 4> Uses;
+ unsigned ExtractedElements = 0;
+ for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
+ UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
+ if (UI.getUse().getResNo() != InputVector.getResNo())
+ return SDValue();
+
+ SDNode *Extract = *UI;
+ if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+
+ if (Extract->getValueType(0) != MVT::i32)
+ return SDValue();
+ if (!Extract->hasOneUse())
+ return SDValue();
+ if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
+ Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
+ return SDValue();
+ if (!isa<ConstantSDNode>(Extract->getOperand(1)))
+ return SDValue();
+
+ // Record which element was extracted.
+ ExtractedElements |=
+ 1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
+
+ Uses.push_back(Extract);
+ }
+
+ // If not all the elements were used, this may not be worthwhile.
+ if (ExtractedElements != 15)
+ return SDValue();
+
+ // Ok, we've now decided to do the transformation.
+ DebugLoc dl = InputVector.getDebugLoc();
+
+ // Store the value to a temporary stack slot.
+ SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
+ SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
+ MachinePointerInfo(), false, false, 0);
+
+ // Replace each use (extract) with a load of the appropriate element.
+ for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
+ UE = Uses.end(); UI != UE; ++UI) {
+ SDNode *Extract = *UI;
+
+ // Compute the element's address.
+ SDValue Idx = Extract->getOperand(1);
+ unsigned EltSize =
+ InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
+ uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
+ SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
+
+ SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
+ StackPtr, OffsetVal);
+
+ // Load the scalar.
+ SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch,
+ ScalarAddr, MachinePointerInfo(),
+ false, false, 0);
+
+ // Replace the extract with the load.
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar);
+ }
+
+ // The replacement was made in place; don't return anything.
+ return SDValue();
+}
+
+/// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
+/// nodes.
+static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ DebugLoc DL = N->getDebugLoc();
+ SDValue Cond = N->getOperand(0);
+ // Get the LHS/RHS of the select.
+ SDValue LHS = N->getOperand(1); + SDValue RHS = N->getOperand(2); + EVT VT = LHS.getValueType(); + + // If we have SSE[12] support, try to form min/max nodes. SSE min/max + // instructions match the semantics of the common C idiom x<y?x:y but not + // x<=y?x:y, because of how they handle negative zero (which can be + // ignored in unsafe-math mode). + if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() && + VT != MVT::f80 && DAG.getTargetLoweringInfo().isTypeLegal(VT) && + (Subtarget->hasXMMInt() || + (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) { + ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); + + unsigned Opcode = 0; + // Check for x CC y ? x : y. + if (DAG.isEqualTo(LHS, Cond.getOperand(0)) && + DAG.isEqualTo(RHS, Cond.getOperand(1))) { + switch (CC) { + default: break; + case ISD::SETULT: + // Converting this to a min would handle NaNs incorrectly, and swapping + // the operands would cause it to handle comparisons between positive + // and negative zero incorrectly. + if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { + if (!UnsafeFPMath && + !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) + break; + std::swap(LHS, RHS); + } + Opcode = X86ISD::FMIN; + break; + case ISD::SETOLE: + // Converting this to a min would handle comparisons between positive + // and negative zero incorrectly. + if (!UnsafeFPMath && + !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) + break; + Opcode = X86ISD::FMIN; + break; + case ISD::SETULE: + // Converting this to a min would handle both negative zeros and NaNs + // incorrectly, but we can swap the operands to fix both. + std::swap(LHS, RHS); + case ISD::SETOLT: + case ISD::SETLT: + case ISD::SETLE: + Opcode = X86ISD::FMIN; + break; + + case ISD::SETOGE: + // Converting this to a max would handle comparisons between positive + // and negative zero incorrectly. + if (!UnsafeFPMath && + !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) + break; + Opcode = X86ISD::FMAX; + break; + case ISD::SETUGT: + // Converting this to a max would handle NaNs incorrectly, and swapping + // the operands would cause it to handle comparisons between positive + // and negative zero incorrectly. + if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { + if (!UnsafeFPMath && + !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) + break; + std::swap(LHS, RHS); + } + Opcode = X86ISD::FMAX; + break; + case ISD::SETUGE: + // Converting this to a max would handle both negative zeros and NaNs + // incorrectly, but we can swap the operands to fix both. + std::swap(LHS, RHS); + case ISD::SETOGT: + case ISD::SETGT: + case ISD::SETGE: + Opcode = X86ISD::FMAX; + break; + } + // Check for x CC y ? y : x -- a min/max with reversed arms. + } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) && + DAG.isEqualTo(RHS, Cond.getOperand(0))) { + switch (CC) { + default: break; + case ISD::SETOGE: + // Converting this to a min would handle comparisons between positive + // and negative zero incorrectly, and swapping the operands would + // cause it to handle NaNs incorrectly. + if (!UnsafeFPMath && + !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) { + if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) + break; + std::swap(LHS, RHS); + } + Opcode = X86ISD::FMIN; + break; + case ISD::SETUGT: + // Converting this to a min would handle NaNs incorrectly. 
+ if (!UnsafeFPMath && + (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) + break; + Opcode = X86ISD::FMIN; + break; + case ISD::SETUGE: + // Converting this to a min would handle both negative zeros and NaNs + // incorrectly, but we can swap the operands to fix both. + std::swap(LHS, RHS); + case ISD::SETOGT: + case ISD::SETGT: + case ISD::SETGE: + Opcode = X86ISD::FMIN; + break; + + case ISD::SETULT: + // Converting this to a max would handle NaNs incorrectly. + if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) + break; + Opcode = X86ISD::FMAX; + break; + case ISD::SETOLE: + // Converting this to a max would handle comparisons between positive + // and negative zero incorrectly, and swapping the operands would + // cause it to handle NaNs incorrectly. + if (!UnsafeFPMath && + !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) { + if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) + break; + std::swap(LHS, RHS); + } + Opcode = X86ISD::FMAX; + break; + case ISD::SETULE: + // Converting this to a max would handle both negative zeros and NaNs + // incorrectly, but we can swap the operands to fix both. + std::swap(LHS, RHS); + case ISD::SETOLT: + case ISD::SETLT: + case ISD::SETLE: + Opcode = X86ISD::FMAX; + break; + } + } + + if (Opcode) + return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS); + } + + // If this is a select between two integer constants, try to do some + // optimizations. + if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) { + if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS)) + // Don't do this for crazy integer types. + if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) { + // If this is efficiently invertible, canonicalize the LHSC/RHSC values + // so that TrueC (the true value) is larger than FalseC. + bool NeedsCondInvert = false; + + if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) && + // Efficiently invertible. + (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible. + (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible. + isa<ConstantSDNode>(Cond.getOperand(1))))) { + NeedsCondInvert = true; + std::swap(TrueC, FalseC); + } + + // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0. + if (FalseC->getAPIntValue() == 0 && + TrueC->getAPIntValue().isPowerOf2()) { + if (NeedsCondInvert) // Invert the condition if needed. + Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, + DAG.getConstant(1, Cond.getValueType())); + + // Zero extend the condition if needed. + Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond); + + unsigned ShAmt = TrueC->getAPIntValue().logBase2(); + return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond, + DAG.getConstant(ShAmt, MVT::i8)); + } + + // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. + if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { + if (NeedsCondInvert) // Invert the condition if needed. + Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, + DAG.getConstant(1, Cond.getValueType())); + + // Zero extend the condition if needed. + Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, + FalseC->getValueType(0), Cond); + return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, + SDValue(FalseC, 0)); + } + + // Optimize cases that will turn into an LEA instruction. This requires + // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 
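+ // (Example: select Cond, 5, 1 gives Diff = 4, so the result is computed as
+ //  1 + zext(Cond)*4, which matches the "lea base( , cond*4)" row below.)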
+ if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { + uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); + if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; + + bool isFastMultiplier = false; + if (Diff < 10) { + switch ((unsigned char)Diff) { + default: break; + case 1: // result = add base, cond + case 2: // result = lea base( , cond*2) + case 3: // result = lea base(cond, cond*2) + case 4: // result = lea base( , cond*4) + case 5: // result = lea base(cond, cond*4) + case 8: // result = lea base( , cond*8) + case 9: // result = lea base(cond, cond*8) + isFastMultiplier = true; + break; + } + } + + if (isFastMultiplier) { + APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); + if (NeedsCondInvert) // Invert the condition if needed. + Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, + DAG.getConstant(1, Cond.getValueType())); + + // Zero extend the condition if needed. + Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), + Cond); + // Scale the condition by the difference. + if (Diff != 1) + Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, + DAG.getConstant(Diff, Cond.getValueType())); + + // Add the base if non-zero. + if (FalseC->getAPIntValue() != 0) + Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, + SDValue(FalseC, 0)); + return Cond; + } + } + } + } + + return SDValue(); +} + +/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] +static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + DebugLoc DL = N->getDebugLoc(); + + // If the flag operand isn't dead, don't touch this CMOV. + if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty()) + return SDValue(); + + SDValue FalseOp = N->getOperand(0); + SDValue TrueOp = N->getOperand(1); + X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); + SDValue Cond = N->getOperand(3); + if (CC == X86::COND_E || CC == X86::COND_NE) { + switch (Cond.getOpcode()) { + default: break; + case X86ISD::BSR: + case X86ISD::BSF: + // If operand of BSR / BSF are proven never zero, then ZF cannot be set. + if (DAG.isKnownNeverZero(Cond.getOperand(0))) + return (CC == X86::COND_E) ? FalseOp : TrueOp; + } + } + + // If this is a select between two integer constants, try to do some + // optimizations. Note that the operands are ordered the opposite of SELECT + // operands. + if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) { + if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) { + // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is + // larger than FalseC (the false value). + if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) { + CC = X86::GetOppositeBranchCondition(CC); + std::swap(TrueC, FalseC); + } + + // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0. + // This is efficient for any integer data type (including i8/i16) and + // shift amount. + if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { + Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, + DAG.getConstant(CC, MVT::i8), Cond); + + // Zero extend the condition if needed. + Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond); + + unsigned ShAmt = TrueC->getAPIntValue().logBase2(); + Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond, + DAG.getConstant(ShAmt, MVT::i8)); + if (N->getNumValues() == 2) // Dead flag value? + return DCI.CombineTo(N, Cond, SDValue()); + return Cond; + } + + // Optimize Cond ? 
cst+1 : cst -> zext(setcc(C)+cst. This is efficient + // for any integer data type, including i8/i16. + if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { + Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, + DAG.getConstant(CC, MVT::i8), Cond); + + // Zero extend the condition if needed. + Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, + FalseC->getValueType(0), Cond); + Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, + SDValue(FalseC, 0)); + + if (N->getNumValues() == 2) // Dead flag value? + return DCI.CombineTo(N, Cond, SDValue()); + return Cond; + } + + // Optimize cases that will turn into an LEA instruction. This requires + // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). + if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { + uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); + if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; + + bool isFastMultiplier = false; + if (Diff < 10) { + switch ((unsigned char)Diff) { + default: break; + case 1: // result = add base, cond + case 2: // result = lea base( , cond*2) + case 3: // result = lea base(cond, cond*2) + case 4: // result = lea base( , cond*4) + case 5: // result = lea base(cond, cond*4) + case 8: // result = lea base( , cond*8) + case 9: // result = lea base(cond, cond*8) + isFastMultiplier = true; + break; + } + } + + if (isFastMultiplier) { + APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); + Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, + DAG.getConstant(CC, MVT::i8), Cond); + // Zero extend the condition if needed. + Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), + Cond); + // Scale the condition by the difference. + if (Diff != 1) + Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, + DAG.getConstant(Diff, Cond.getValueType())); + + // Add the base if non-zero. + if (FalseC->getAPIntValue() != 0) + Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, + SDValue(FalseC, 0)); + if (N->getNumValues() == 2) // Dead flag value? + return DCI.CombineTo(N, Cond, SDValue()); + return Cond; + } + } + } + } + return SDValue(); +} + + +/// PerformMulCombine - Optimize a single multiply with constant into two +/// in order to implement it with two cheaper instructions, e.g. +/// LEA + SHL, LEA + LEA. +static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) + return SDValue(); + + EVT VT = N->getValueType(0); + if (VT != MVT::i64) + return SDValue(); + + ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); + if (!C) + return SDValue(); + uint64_t MulAmt = C->getZExtValue(); + if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9) + return SDValue(); + + uint64_t MulAmt1 = 0; + uint64_t MulAmt2 = 0; + if ((MulAmt % 9) == 0) { + MulAmt1 = 9; + MulAmt2 = MulAmt / 9; + } else if ((MulAmt % 5) == 0) { + MulAmt1 = 5; + MulAmt2 = MulAmt / 5; + } else if ((MulAmt % 3) == 0) { + MulAmt1 = 3; + MulAmt2 = MulAmt / 3; + } + if (MulAmt2 && + (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){ + DebugLoc DL = N->getDebugLoc(); + + if (isPowerOf2_64(MulAmt2) && + !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD)) + // If second multiplifer is pow2, issue it first. We want the multiply by + // 3, 5, or 9 to be folded into the addressing mode unless the lone use + // is an add. 
+ std::swap(MulAmt1, MulAmt2); + + SDValue NewMul; + if (isPowerOf2_64(MulAmt1)) + NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), + DAG.getConstant(Log2_64(MulAmt1), MVT::i8)); + else + NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), + DAG.getConstant(MulAmt1, VT)); + + if (isPowerOf2_64(MulAmt2)) + NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul, + DAG.getConstant(Log2_64(MulAmt2), MVT::i8)); + else + NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul, + DAG.getConstant(MulAmt2, VT)); + + // Do not add new nodes to DAG combiner worklist. + DCI.CombineTo(N, NewMul, false); + } + return SDValue(); +} + +static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); + EVT VT = N0.getValueType(); + + // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2)) + // since the result of setcc_c is all zero's or all ones. + if (N1C && N0.getOpcode() == ISD::AND && + N0.getOperand(1).getOpcode() == ISD::Constant) { + SDValue N00 = N0.getOperand(0); + if (N00.getOpcode() == X86ISD::SETCC_CARRY || + ((N00.getOpcode() == ISD::ANY_EXTEND || + N00.getOpcode() == ISD::ZERO_EXTEND) && + N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) { + APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); + APInt ShAmt = N1C->getAPIntValue(); + Mask = Mask.shl(ShAmt); + if (Mask != 0) + return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, + N00, DAG.getConstant(Mask, VT)); + } + } + + return SDValue(); +} + +/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts +/// when possible. +static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + EVT VT = N->getValueType(0); + if (!VT.isVector() && VT.isInteger() && + N->getOpcode() == ISD::SHL) + return PerformSHLCombine(N, DAG); + + // On X86 with SSE2 support, we can transform this to a vector shift if + // all elements are shifted by the same amount. We can't do this in legalize + // because the a constant vector is typically transformed to a constant pool + // so we have no knowledge of the shift amount. 
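+ // (Example: (shl v4i32:x, <5,5,5,5>) is rewritten below as the
+ //  x86_sse2_pslli_d intrinsic with a single i32 shift amount of 5.)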
+ if (!Subtarget->hasXMMInt()) + return SDValue(); + + if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16) + return SDValue(); + + SDValue ShAmtOp = N->getOperand(1); + EVT EltVT = VT.getVectorElementType(); + DebugLoc DL = N->getDebugLoc(); + SDValue BaseShAmt = SDValue(); + if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) { + unsigned NumElts = VT.getVectorNumElements(); + unsigned i = 0; + for (; i != NumElts; ++i) { + SDValue Arg = ShAmtOp.getOperand(i); + if (Arg.getOpcode() == ISD::UNDEF) continue; + BaseShAmt = Arg; + break; + } + for (; i != NumElts; ++i) { + SDValue Arg = ShAmtOp.getOperand(i); + if (Arg.getOpcode() == ISD::UNDEF) continue; + if (Arg != BaseShAmt) { + return SDValue(); + } + } + } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE && + cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) { + SDValue InVec = ShAmtOp.getOperand(0); + if (InVec.getOpcode() == ISD::BUILD_VECTOR) { + unsigned NumElts = InVec.getValueType().getVectorNumElements(); + unsigned i = 0; + for (; i != NumElts; ++i) { + SDValue Arg = InVec.getOperand(i); + if (Arg.getOpcode() == ISD::UNDEF) continue; + BaseShAmt = Arg; + break; + } + } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) { + unsigned SplatIdx= cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex(); + if (C->getZExtValue() == SplatIdx) + BaseShAmt = InVec.getOperand(1); + } + } + if (BaseShAmt.getNode() == 0) + BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp, + DAG.getIntPtrConstant(0)); + } else + return SDValue(); + + // The shift amount is an i32. + if (EltVT.bitsGT(MVT::i32)) + BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt); + else if (EltVT.bitsLT(MVT::i32)) + BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt); + + // The shift amount is identical so we can do a vector shift. + SDValue ValOp = N->getOperand(0); + switch (N->getOpcode()) { + default: + llvm_unreachable("Unknown shift opcode!"); + break; + case ISD::SHL: + if (VT == MVT::v2i64) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), + ValOp, BaseShAmt); + if (VT == MVT::v4i32) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), + ValOp, BaseShAmt); + if (VT == MVT::v8i16) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), + ValOp, BaseShAmt); + break; + case ISD::SRA: + if (VT == MVT::v4i32) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32), + ValOp, BaseShAmt); + if (VT == MVT::v8i16) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32), + ValOp, BaseShAmt); + break; + case ISD::SRL: + if (VT == MVT::v2i64) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), + ValOp, BaseShAmt); + if (VT == MVT::v4i32) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32), + ValOp, BaseShAmt); + if (VT == MVT::v8i16) + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32), + ValOp, BaseShAmt); + break; + } + return SDValue(); +} + + +// CMPEQCombine - Recognize the distinctive (AND (setcc ...) 
(setcc ..)) +// where both setccs reference the same FP CMP, and rewrite for CMPEQSS +// and friends. Likewise for OR -> CMPNEQSS. +static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + unsigned opcode; + + // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but + // we're requiring SSE2 for both. + if (Subtarget->hasXMMInt() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDValue CMP0 = N0->getOperand(1); + SDValue CMP1 = N1->getOperand(1); + DebugLoc DL = N->getDebugLoc(); + + // The SETCCs should both refer to the same CMP. + if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1) + return SDValue(); + + SDValue CMP00 = CMP0->getOperand(0); + SDValue CMP01 = CMP0->getOperand(1); + EVT VT = CMP00.getValueType(); + + if (VT == MVT::f32 || VT == MVT::f64) { + bool ExpectingFlags = false; + // Check for any users that want flags: + for (SDNode::use_iterator UI = N->use_begin(), + UE = N->use_end(); + !ExpectingFlags && UI != UE; ++UI) + switch (UI->getOpcode()) { + default: + case ISD::BR_CC: + case ISD::BRCOND: + case ISD::SELECT: + ExpectingFlags = true; + break; + case ISD::CopyToReg: + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: + case ISD::ANY_EXTEND: + break; + } + + if (!ExpectingFlags) { + enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0); + enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0); + + if (cc1 == X86::COND_E || cc1 == X86::COND_NE) { + X86::CondCode tmp = cc0; + cc0 = cc1; + cc1 = tmp; + } + + if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) || + (cc0 == X86::COND_NE && cc1 == X86::COND_P)) { + bool is64BitFP = (CMP00.getValueType() == MVT::f64); + X86ISD::NodeType NTOperator = is64BitFP ? + X86ISD::FSETCCsd : X86ISD::FSETCCss; + // FIXME: need symbolic constants for these magic numbers. + // See X86ATTInstPrinter.cpp:printSSECC(). + unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4; + SDValue OnesOrZeroesF = DAG.getNode(NTOperator, DL, MVT::f32, CMP00, CMP01, + DAG.getConstant(x86cc, MVT::i8)); + SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, MVT::i32, + OnesOrZeroesF); + SDValue ANDed = DAG.getNode(ISD::AND, DL, MVT::i32, OnesOrZeroesI, + DAG.getConstant(1, MVT::i32)); + SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed); + return OneBitOfTruth; + } + } + } + } + return SDValue(); +} + +/// CanFoldXORWithAllOnes - Test whether the XOR operand is a AllOnes vector +/// so it can be folded inside ANDNP. +static bool CanFoldXORWithAllOnes(const SDNode *N) { + EVT VT = N->getValueType(0); + + // Match direct AllOnes for 128 and 256-bit vectors + if (ISD::isBuildVectorAllOnes(N)) + return true; + + // Look through a bit convert. 
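+  // (The all-ones constant may have been built in a different element type
+  // and bitcast to the type of the XOR operand.)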
+ if (N->getOpcode() == ISD::BITCAST) + N = N->getOperand(0).getNode(); + + // Sometimes the operand may come from a insert_subvector building a 256-bit + // allones vector + if (VT.getSizeInBits() == 256 && + N->getOpcode() == ISD::INSERT_SUBVECTOR) { + SDValue V1 = N->getOperand(0); + SDValue V2 = N->getOperand(1); + + if (V1.getOpcode() == ISD::INSERT_SUBVECTOR && + V1.getOperand(0).getOpcode() == ISD::UNDEF && + ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) && + ISD::isBuildVectorAllOnes(V2.getNode())) + return true; + } + + return false; +} + +static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget); + if (R.getNode()) + return R; + + EVT VT = N->getValueType(0); + + // Create ANDN instructions + if (Subtarget->hasBMI() && (VT == MVT::i32 || VT == MVT::i64)) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + DebugLoc DL = N->getDebugLoc(); + + // Check LHS for not + if (N0.getOpcode() == ISD::XOR && isAllOnes(N0.getOperand(1))) + return DAG.getNode(X86ISD::ANDN, DL, VT, N0.getOperand(0), N1); + // Check RHS for not + if (N1.getOpcode() == ISD::XOR && isAllOnes(N1.getOperand(1))) + return DAG.getNode(X86ISD::ANDN, DL, VT, N1.getOperand(0), N0); + + return SDValue(); + } + + // Want to form ANDNP nodes: + // 1) In the hopes of then easily combining them with OR and AND nodes + // to form PBLEND/PSIGN. + // 2) To match ANDN packed intrinsics + if (VT != MVT::v2i64 && VT != MVT::v4i64) + return SDValue(); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + DebugLoc DL = N->getDebugLoc(); + + // Check LHS for vnot + if (N0.getOpcode() == ISD::XOR && + //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode())) + CanFoldXORWithAllOnes(N0.getOperand(1).getNode())) + return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1); + + // Check RHS for vnot + if (N1.getOpcode() == ISD::XOR && + //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode())) + CanFoldXORWithAllOnes(N1.getOperand(1).getNode())) + return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0); + + return SDValue(); +} + +static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget); + if (R.getNode()) + return R; + + EVT VT = N->getValueType(0); + if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64 && VT != MVT::v2i64) + return SDValue(); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + // look for psign/blend + if (Subtarget->hasSSSE3() || Subtarget->hasAVX()) { + if (VT == MVT::v2i64) { + // Canonicalize pandn to RHS + if (N0.getOpcode() == X86ISD::ANDNP) + std::swap(N0, N1); + // or (and (m, x), (pandn m, y)) + if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) { + SDValue Mask = N1.getOperand(0); + SDValue X = N1.getOperand(1); + SDValue Y; + if (N0.getOperand(0) == Mask) + Y = N0.getOperand(1); + if (N0.getOperand(1) == Mask) + Y = N0.getOperand(0); + + // Check to see if the mask appeared in both the AND and ANDNP and + if (!Y.getNode()) + return SDValue(); + + // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them. 
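+        // Overall we are matching (or (and m, y), (andnp m, x)) where m is an
+        // all-sign-bits mask produced by a psrai intrinsic, i.e. a vector
+        // select between x and y that psign or pblendvb can perform directly.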
+ if (Mask.getOpcode() != ISD::BITCAST || + X.getOpcode() != ISD::BITCAST || + Y.getOpcode() != ISD::BITCAST) + return SDValue(); + + // Look through mask bitcast. + Mask = Mask.getOperand(0); + EVT MaskVT = Mask.getValueType(); + + // Validate that the Mask operand is a vector sra node. The sra node + // will be an intrinsic. + if (Mask.getOpcode() != ISD::INTRINSIC_WO_CHAIN) + return SDValue(); + + // FIXME: what to do for bytes, since there is a psignb/pblendvb, but + // there is no psrai.b + switch (cast<ConstantSDNode>(Mask.getOperand(0))->getZExtValue()) { + case Intrinsic::x86_sse2_psrai_w: + case Intrinsic::x86_sse2_psrai_d: + break; + default: return SDValue(); + } + + // Check that the SRA is all signbits. + SDValue SraC = Mask.getOperand(2); + unsigned SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue(); + unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits(); + if ((SraAmt + 1) != EltBits) + return SDValue(); + + DebugLoc DL = N->getDebugLoc(); + + // Now we know we at least have a plendvb with the mask val. See if + // we can form a psignb/w/d. + // psign = x.type == y.type == mask.type && y = sub(0, x); + X = X.getOperand(0); + Y = Y.getOperand(0); + if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X && + ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) && + X.getValueType() == MaskVT && X.getValueType() == Y.getValueType()){ + unsigned Opc = 0; + switch (EltBits) { + case 8: Opc = X86ISD::PSIGNB; break; + case 16: Opc = X86ISD::PSIGNW; break; + case 32: Opc = X86ISD::PSIGND; break; + default: break; + } + if (Opc) { + SDValue Sign = DAG.getNode(Opc, DL, MaskVT, X, Mask.getOperand(1)); + return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Sign); + } + } + // PBLENDVB only available on SSE 4.1 + if (!(Subtarget->hasSSE41() || Subtarget->hasAVX())) + return SDValue(); + + X = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, X); + Y = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Y); + Mask = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Mask); + Mask = DAG.getNode(ISD::VSELECT, DL, MVT::v16i8, Mask, X, Y); + return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Mask); + } + } + } + + // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) + if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) + std::swap(N0, N1); + if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL) + return SDValue(); + if (!N0.hasOneUse() || !N1.hasOneUse()) + return SDValue(); + + SDValue ShAmt0 = N0.getOperand(1); + if (ShAmt0.getValueType() != MVT::i8) + return SDValue(); + SDValue ShAmt1 = N1.getOperand(1); + if (ShAmt1.getValueType() != MVT::i8) + return SDValue(); + if (ShAmt0.getOpcode() == ISD::TRUNCATE) + ShAmt0 = ShAmt0.getOperand(0); + if (ShAmt1.getOpcode() == ISD::TRUNCATE) + ShAmt1 = ShAmt1.getOperand(0); + + DebugLoc DL = N->getDebugLoc(); + unsigned Opc = X86ISD::SHLD; + SDValue Op0 = N0.getOperand(0); + SDValue Op1 = N1.getOperand(0); + if (ShAmt0.getOpcode() == ISD::SUB) { + Opc = X86ISD::SHRD; + std::swap(Op0, Op1); + std::swap(ShAmt0, ShAmt1); + } + + unsigned Bits = VT.getSizeInBits(); + if (ShAmt1.getOpcode() == ISD::SUB) { + SDValue Sum = ShAmt1.getOperand(0); + if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) { + SDValue ShAmt1Op1 = ShAmt1.getOperand(1); + if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE) + ShAmt1Op1 = ShAmt1Op1.getOperand(0); + if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0) + return DAG.getNode(Opc, DL, VT, + Op0, Op1, + DAG.getNode(ISD::TRUNCATE, DL, + MVT::i8, ShAmt0)); + } + } else if (ConstantSDNode *ShAmt1C = 
dyn_cast<ConstantSDNode>(ShAmt1)) { + ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0); + if (ShAmt0C && + ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits) + return DAG.getNode(Opc, DL, VT, + N0.getOperand(0), N1.getOperand(0), + DAG.getNode(ISD::TRUNCATE, DL, + MVT::i8, ShAmt0)); + } + + return SDValue(); +} + +/// PerformLOADCombine - Do target-specific dag combines on LOAD nodes. +static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + LoadSDNode *Ld = cast<LoadSDNode>(N); + EVT RegVT = Ld->getValueType(0); + EVT MemVT = Ld->getMemoryVT(); + DebugLoc dl = Ld->getDebugLoc(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + ISD::LoadExtType Ext = Ld->getExtensionType(); + + // If this is a vector EXT Load then attempt to optimize it using a + // shuffle. We need SSE4 for the shuffles. + // TODO: It is possible to support ZExt by zeroing the undef values + // during the shuffle phase or after the shuffle. + if (RegVT.isVector() && Ext == ISD::EXTLOAD && Subtarget->hasSSE41()) { + assert(MemVT != RegVT && "Cannot extend to the same type"); + assert(MemVT.isVector() && "Must load a vector from memory"); + + unsigned NumElems = RegVT.getVectorNumElements(); + unsigned RegSz = RegVT.getSizeInBits(); + unsigned MemSz = MemVT.getSizeInBits(); + assert(RegSz > MemSz && "Register size must be greater than the mem size"); + // All sizes must be a power of two + if (!isPowerOf2_32(RegSz * MemSz * NumElems)) return SDValue(); + + // Attempt to load the original value using a single load op. + // Find a scalar type which is equal to the loaded word size. + MVT SclrLoadTy = MVT::i8; + for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE; + tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) { + MVT Tp = (MVT::SimpleValueType)tp; + if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() == MemSz) { + SclrLoadTy = Tp; + break; + } + } + + // Proceed if a load word is found. + if (SclrLoadTy.getSizeInBits() != MemSz) return SDValue(); + + EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy, + RegSz/SclrLoadTy.getSizeInBits()); + + EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), + RegSz/MemVT.getScalarType().getSizeInBits()); + // Can't shuffle using an illegal type. + if (!TLI.isTypeLegal(WideVecVT)) return SDValue(); + + // Perform a single load. + SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), + Ld->getBasePtr(), + Ld->getPointerInfo(), Ld->isVolatile(), + Ld->isNonTemporal(), Ld->getAlignment()); + + // Insert the word loaded into a vector. + SDValue ScalarInVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, + LoadUnitVecVT, ScalarLoad); + + // Bitcast the loaded value to a vector of the original element type, in + // the size of the target vector type. + SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, ScalarInVector); + unsigned SizeRatio = RegSz/MemSz; + + // Redistribute the loaded elements into the different locations. + SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i < NumElems; i++) ShuffleVec[i*SizeRatio] = i; + + SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, + DAG.getUNDEF(SlicedVec.getValueType()), + ShuffleVec.data()); + + // Bitcast to the requested type. + Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff); + // Replace the original load with the new sequence + // and return the new chain. 
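+    // (For example, an EXTLOAD of <8 x i8> extended to <8 x i16> is now a
+    // single i64 load whose bytes were spread into the even lanes of a v16i8
+    // by the shuffle above, then bitcast back to v8i16.)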
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Shuff); + return SDValue(ScalarLoad.getNode(), 1); + } + + return SDValue(); +} + +/// PerformSTORECombine - Do target-specific dag combines on STORE nodes. +static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + StoreSDNode *St = cast<StoreSDNode>(N); + EVT VT = St->getValue().getValueType(); + EVT StVT = St->getMemoryVT(); + DebugLoc dl = St->getDebugLoc(); + SDValue StoredVal = St->getOperand(1); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + // If we are saving a concatination of two XMM registers, perform two stores. + // This is better in Sandy Bridge cause one 256-bit mem op is done via two + // 128-bit ones. If in the future the cost becomes only one memory access the + // first version would be better. + if (VT.getSizeInBits() == 256 && + StoredVal.getNode()->getOpcode() == ISD::CONCAT_VECTORS && + StoredVal.getNumOperands() == 2) { + + SDValue Value0 = StoredVal.getOperand(0); + SDValue Value1 = StoredVal.getOperand(1); + + SDValue Stride = DAG.getConstant(16, TLI.getPointerTy()); + SDValue Ptr0 = St->getBasePtr(); + SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride); + + SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0, + St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), St->getAlignment()); + SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1, + St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), St->getAlignment()); + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1); + } + + // Optimize trunc store (of multiple scalars) to shuffle and store. + // First, pack all of the elements in one place. Next, store to memory + // in fewer chunks. + if (St->isTruncatingStore() && VT.isVector()) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + unsigned NumElems = VT.getVectorNumElements(); + assert(StVT != VT && "Cannot truncate to the same type"); + unsigned FromSz = VT.getVectorElementType().getSizeInBits(); + unsigned ToSz = StVT.getVectorElementType().getSizeInBits(); + + // From, To sizes and ElemCount must be pow of two + if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue(); + // We are going to use the original vector elt for storing. + // Accumulated smaller vector elements must be a multiple of the store size. + if (0 != (NumElems * FromSz) % ToSz) return SDValue(); + + unsigned SizeRatio = FromSz / ToSz; + + assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits()); + + // Create a type on which we perform the shuffle + EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), + StVT.getScalarType(), NumElems*SizeRatio); + + assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); + + SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue()); + SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i < NumElems; i++ ) ShuffleVec[i] = i * SizeRatio; + + // Can't shuffle using an illegal type + if (!TLI.isTypeLegal(WideVecVT)) return SDValue(); + + SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec, + DAG.getUNDEF(WideVec.getValueType()), + ShuffleVec.data()); + // At this point all of the data is stored at the bottom of the + // register. We now need to save it to mem. 
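+    // (For example, a truncating store of <8 x i16> to <8 x i8> has packed
+    // the eight low bytes into the bottom of a v16i8 and is emitted below as
+    // a single i64 store, or as two i32 stores if i64 is not a legal type.)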
+ + // Find the largest store unit + MVT StoreType = MVT::i8; + for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE; + tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) { + MVT Tp = (MVT::SimpleValueType)tp; + if (TLI.isTypeLegal(Tp) && StoreType.getSizeInBits() < NumElems * ToSz) + StoreType = Tp; + } + + // Bitcast the original vector into a vector of store-size units + EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), + StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits()); + assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); + SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff); + SmallVector<SDValue, 8> Chains; + SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8, + TLI.getPointerTy()); + SDValue Ptr = St->getBasePtr(); + + // Perform one or more big stores into memory. + for (unsigned i = 0; i < (ToSz*NumElems)/StoreType.getSizeInBits() ; i++) { + SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, + StoreType, ShuffWide, + DAG.getIntPtrConstant(i)); + SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr, + St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), St->getAlignment()); + Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); + Chains.push_back(Ch); + } + + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0], + Chains.size()); + } + + + // Turn load->store of MMX types into GPR load/stores. This avoids clobbering + // the FP state in cases where an emms may be missing. + // A preferable solution to the general problem is to figure out the right + // places to insert EMMS. This qualifies as a quick hack. + + // Similarly, turn load->store of i64 into double load/stores in 32-bit mode. + if (VT.getSizeInBits() != 64) + return SDValue(); + + const Function *F = DAG.getMachineFunction().getFunction(); + bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat); + bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps + && Subtarget->hasXMMInt(); + if ((VT.isVector() || + (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && + isa<LoadSDNode>(St->getValue()) && + !cast<LoadSDNode>(St->getValue())->isVolatile() && + St->getChain().hasOneUse() && !St->isVolatile()) { + SDNode* LdVal = St->getValue().getNode(); + LoadSDNode *Ld = 0; + int TokenFactorIndex = -1; + SmallVector<SDValue, 8> Ops; + SDNode* ChainVal = St->getChain().getNode(); + // Must be a store of a load. We currently handle two cases: the load + // is a direct child, and it's under an intervening TokenFactor. It is + // possible to dig deeper under nested TokenFactors. + if (ChainVal == LdVal) + Ld = cast<LoadSDNode>(St->getChain()); + else if (St->getValue().hasOneUse() && + ChainVal->getOpcode() == ISD::TokenFactor) { + for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) { + if (ChainVal->getOperand(i).getNode() == LdVal) { + TokenFactorIndex = i; + Ld = cast<LoadSDNode>(St->getValue()); + } else + Ops.push_back(ChainVal->getOperand(i)); + } + } + + if (!Ld || !ISD::isNormalLoad(Ld)) + return SDValue(); + + // If this is not the MMX case, i.e. we are just turning i64 load/store + // into f64 load/store, avoid the transformation if there are multiple + // uses of the loaded value. + if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) + return SDValue(); + + DebugLoc LdDL = Ld->getDebugLoc(); + DebugLoc StDL = N->getDebugLoc(); + // If we are a 64-bit capable x86, lower to a single movq load/store pair. + // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store + // pair instead. 
+ if (Subtarget->is64Bit() || F64IsLegal) { + EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64; + SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), + Ld->getPointerInfo(), Ld->isVolatile(), + Ld->isNonTemporal(), Ld->getAlignment()); + SDValue NewChain = NewLd.getValue(1); + if (TokenFactorIndex != -1) { + Ops.push_back(NewChain); + NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], + Ops.size()); + } + return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(), + St->getPointerInfo(), + St->isVolatile(), St->isNonTemporal(), + St->getAlignment()); + } + + // Otherwise, lower to two pairs of 32-bit loads / stores. + SDValue LoAddr = Ld->getBasePtr(); + SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr, + DAG.getConstant(4, MVT::i32)); + + SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, + Ld->getPointerInfo(), + Ld->isVolatile(), Ld->isNonTemporal(), + Ld->getAlignment()); + SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, + Ld->getPointerInfo().getWithOffset(4), + Ld->isVolatile(), Ld->isNonTemporal(), + MinAlign(Ld->getAlignment(), 4)); + + SDValue NewChain = LoLd.getValue(1); + if (TokenFactorIndex != -1) { + Ops.push_back(LoLd); + Ops.push_back(HiLd); + NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], + Ops.size()); + } + + LoAddr = St->getBasePtr(); + HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr, + DAG.getConstant(4, MVT::i32)); + + SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr, + St->getPointerInfo(), + St->isVolatile(), St->isNonTemporal(), + St->getAlignment()); + SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr, + St->getPointerInfo().getWithOffset(4), + St->isVolatile(), + St->isNonTemporal(), + MinAlign(St->getAlignment(), 4)); + return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); + } + return SDValue(); +} + +/// isHorizontalBinOp - Return 'true' if this vector operation is "horizontal" +/// and return the operands for the horizontal operation in LHS and RHS. A +/// horizontal operation performs the binary operation on successive elements +/// of its first operand, then on successive elements of its second operand, +/// returning the resulting values in a vector. For example, if +/// A = < float a0, float a1, float a2, float a3 > +/// and +/// B = < float b0, float b1, float b2, float b3 > +/// then the result of doing a horizontal operation on A and B is +/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >. +/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form +/// A horizontal-op B, for some already available A and B, and if so then LHS is +/// set to A, RHS to B, and the routine returns 'true'. +/// Note that the binary operation should have the property that if one of the +/// operands is UNDEF then the result is UNDEF. +static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool isCommutative) { + // Look for the following pattern: if + // A = < float a0, float a1, float a2, float a3 > + // B = < float b0, float b1, float b2, float b3 > + // and + // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6> + // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7> + // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 > + // which is A horizontal-op B. + + // At least one of the operands should be a vector shuffle. 
+ if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE && + RHS.getOpcode() != ISD::VECTOR_SHUFFLE) + return false; + + EVT VT = LHS.getValueType(); + unsigned N = VT.getVectorNumElements(); + + // View LHS in the form + // LHS = VECTOR_SHUFFLE A, B, LMask + // If LHS is not a shuffle then pretend it is the shuffle + // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1> + // NOTE: in what follows a default initialized SDValue represents an UNDEF of + // type VT. + SDValue A, B; + SmallVector<int, 8> LMask(N); + if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) { + if (LHS.getOperand(0).getOpcode() != ISD::UNDEF) + A = LHS.getOperand(0); + if (LHS.getOperand(1).getOpcode() != ISD::UNDEF) + B = LHS.getOperand(1); + cast<ShuffleVectorSDNode>(LHS.getNode())->getMask(LMask); + } else { + if (LHS.getOpcode() != ISD::UNDEF) + A = LHS; + for (unsigned i = 0; i != N; ++i) + LMask[i] = i; + } + + // Likewise, view RHS in the form + // RHS = VECTOR_SHUFFLE C, D, RMask + SDValue C, D; + SmallVector<int, 8> RMask(N); + if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) { + if (RHS.getOperand(0).getOpcode() != ISD::UNDEF) + C = RHS.getOperand(0); + if (RHS.getOperand(1).getOpcode() != ISD::UNDEF) + D = RHS.getOperand(1); + cast<ShuffleVectorSDNode>(RHS.getNode())->getMask(RMask); + } else { + if (RHS.getOpcode() != ISD::UNDEF) + C = RHS; + for (unsigned i = 0; i != N; ++i) + RMask[i] = i; + } + + // Check that the shuffles are both shuffling the same vectors. + if (!(A == C && B == D) && !(A == D && B == C)) + return false; + + // If everything is UNDEF then bail out: it would be better to fold to UNDEF. + if (!A.getNode() && !B.getNode()) + return false; + + // If A and B occur in reverse order in RHS, then "swap" them (which means + // rewriting the mask). + if (A != C) + for (unsigned i = 0; i != N; ++i) { + unsigned Idx = RMask[i]; + if (Idx < N) + RMask[i] += N; + else if (Idx < 2*N) + RMask[i] -= N; + } + + // At this point LHS and RHS are equivalent to + // LHS = VECTOR_SHUFFLE A, B, LMask + // RHS = VECTOR_SHUFFLE A, B, RMask + // Check that the masks correspond to performing a horizontal operation. + for (unsigned i = 0; i != N; ++i) { + unsigned LIdx = LMask[i], RIdx = RMask[i]; + + // Ignore any UNDEF components. + if (LIdx >= 2*N || RIdx >= 2*N || (!A.getNode() && (LIdx < N || RIdx < N)) + || (!B.getNode() && (LIdx >= N || RIdx >= N))) + continue; + + // Check that successive elements are being operated on. If not, this is + // not a horizontal operation. + if (!(LIdx == 2*i && RIdx == 2*i + 1) && + !(isCommutative && LIdx == 2*i + 1 && RIdx == 2*i)) + return false; + } + + LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it. + RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it. + return true; +} + +/// PerformFADDCombine - Do target-specific dag combines on floating point adds. +static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + EVT VT = N->getValueType(0); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + // Try to synthesize horizontal adds from adds of shuffles. + if ((Subtarget->hasSSE3() || Subtarget->hasAVX()) && + (VT == MVT::v4f32 || VT == MVT::v2f64) && + isHorizontalBinOp(LHS, RHS, true)) + return DAG.getNode(X86ISD::FHADD, N->getDebugLoc(), VT, LHS, RHS); + return SDValue(); +} + +/// PerformFSUBCombine - Do target-specific dag combines on floating point subs. 
+static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + EVT VT = N->getValueType(0); + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + // Try to synthesize horizontal subs from subs of shuffles. + if ((Subtarget->hasSSE3() || Subtarget->hasAVX()) && + (VT == MVT::v4f32 || VT == MVT::v2f64) && + isHorizontalBinOp(LHS, RHS, false)) + return DAG.getNode(X86ISD::FHSUB, N->getDebugLoc(), VT, LHS, RHS); + return SDValue(); +} + +/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and +/// X86ISD::FXOR nodes. +static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) { + assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); + // F[X]OR(0.0, x) -> x + // F[X]OR(x, 0.0) -> x + if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) + if (C->getValueAPF().isPosZero()) + return N->getOperand(1); + if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) + if (C->getValueAPF().isPosZero()) + return N->getOperand(0); + return SDValue(); +} + +/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes. +static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { + // FAND(0.0, x) -> 0.0 + // FAND(x, 0.0) -> 0.0 + if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) + if (C->getValueAPF().isPosZero()) + return N->getOperand(0); + if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) + if (C->getValueAPF().isPosZero()) + return N->getOperand(1); + return SDValue(); +} + +static SDValue PerformBTCombine(SDNode *N, + SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + // BT ignores high bits in the bit index operand. + SDValue Op1 = N->getOperand(1); + if (Op1.hasOneUse()) { + unsigned BitWidth = Op1.getValueSizeInBits(); + APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)); + APInt KnownZero, KnownOne; + TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), + !DCI.isBeforeLegalizeOps()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) || + TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO)) + DCI.CommitTargetLoweringOpt(TLO); + } + return SDValue(); +} + +static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) { + SDValue Op = N->getOperand(0); + if (Op.getOpcode() == ISD::BITCAST) + Op = Op.getOperand(0); + EVT VT = N->getValueType(0), OpVT = Op.getValueType(); + if (Op.getOpcode() == X86ISD::VZEXT_LOAD && + VT.getVectorElementType().getSizeInBits() == + OpVT.getVectorElementType().getSizeInBits()) { + return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op); + } + return SDValue(); +} + +static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) { + // (i32 zext (and (i8 x86isd::setcc_carry), 1)) -> + // (and (i32 x86isd::setcc_carry), 1) + // This eliminates the zext. This transformation is necessary because + // ISD::SETCC is always legalized to i8. 
+ DebugLoc dl = N->getDebugLoc(); + SDValue N0 = N->getOperand(0); + EVT VT = N->getValueType(0); + if (N0.getOpcode() == ISD::AND && + N0.hasOneUse() && + N0.getOperand(0).hasOneUse()) { + SDValue N00 = N0.getOperand(0); + if (N00.getOpcode() != X86ISD::SETCC_CARRY) + return SDValue(); + ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); + if (!C || C->getZExtValue() != 1) + return SDValue(); + return DAG.getNode(ISD::AND, dl, VT, + DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, + N00.getOperand(0), N00.getOperand(1)), + DAG.getConstant(1, VT)); + } + + return SDValue(); +} + +// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT +static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) { + unsigned X86CC = N->getConstantOperandVal(0); + SDValue EFLAG = N->getOperand(1); + DebugLoc DL = N->getDebugLoc(); + + // Materialize "setb reg" as "sbb reg,reg", since it can be extended without + // a zext and produces an all-ones bit which is more useful than 0/1 in some + // cases. + if (X86CC == X86::COND_B) + return DAG.getNode(ISD::AND, DL, MVT::i8, + DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, + DAG.getConstant(X86CC, MVT::i8), EFLAG), + DAG.getConstant(1, MVT::i8)); + + return SDValue(); +} + +static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, + const X86TargetLowering *XTLI) { + SDValue Op0 = N->getOperand(0); + // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have + // a 32-bit target where SSE doesn't support i64->FP operations. + if (Op0.getOpcode() == ISD::LOAD) { + LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode()); + EVT VT = Ld->getValueType(0); + if (!Ld->isVolatile() && !N->getValueType(0).isVector() && + ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() && + !XTLI->getSubtarget()->is64Bit() && + !DAG.getTargetLoweringInfo().isTypeLegal(VT)) { + SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0), + Ld->getChain(), Op0, DAG); + DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1)); + return FILDChain; + } + } + return SDValue(); +} + +// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS +static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG, + X86TargetLowering::DAGCombinerInfo &DCI) { + // If the LHS and RHS of the ADC node are zero, then it can't overflow and + // the result is either zero or one (depending on the input carry bit). + // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1. + if (X86::isZeroNode(N->getOperand(0)) && + X86::isZeroNode(N->getOperand(1)) && + // We don't have a good way to replace an EFLAGS use, so only do this when + // dead right now. + SDValue(N, 1).use_empty()) { + DebugLoc DL = N->getDebugLoc(); + EVT VT = N->getValueType(0); + SDValue CarryOut = DAG.getConstant(0, N->getValueType(1)); + SDValue Res1 = DAG.getNode(ISD::AND, DL, VT, + DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, + DAG.getConstant(X86::COND_B,MVT::i8), + N->getOperand(2)), + DAG.getConstant(1, VT)); + return DCI.CombineTo(N, Res1, CarryOut); + } + + return SDValue(); +} + +// fold (add Y, (sete X, 0)) -> adc 0, Y +// (add Y, (setne X, 0)) -> sbb -1, Y +// (sub (sete X, 0), Y) -> sbb 0, Y +// (sub (setne X, 0), Y) -> adc -1, Y +static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) { + DebugLoc DL = N->getDebugLoc(); + + // Look through ZExts. + SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 
1 : 0); + if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse()) + return SDValue(); + + SDValue SetCC = Ext.getOperand(0); + if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse()) + return SDValue(); + + X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0); + if (CC != X86::COND_E && CC != X86::COND_NE) + return SDValue(); + + SDValue Cmp = SetCC.getOperand(1); + if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() || + !X86::isZeroNode(Cmp.getOperand(1)) || + !Cmp.getOperand(0).getValueType().isInteger()) + return SDValue(); + + SDValue CmpOp0 = Cmp.getOperand(0); + SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0, + DAG.getConstant(1, CmpOp0.getValueType())); + + SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1); + if (CC == X86::COND_NE) + return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB, + DL, OtherVal.getValueType(), OtherVal, + DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp); + return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC, + DL, OtherVal.getValueType(), OtherVal, + DAG.getConstant(0, OtherVal.getValueType()), NewCmp); +} + +static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG) { + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + + // X86 can't encode an immediate LHS of a sub. See if we can push the + // negation into a preceding instruction. + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) { + // If the RHS of the sub is a XOR with one use and a constant, invert the + // immediate. Then add one to the LHS of the sub so we can turn + // X-Y -> X+~Y+1, saving one register. + if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR && + isa<ConstantSDNode>(Op1.getOperand(1))) { + APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue(); + EVT VT = Op0.getValueType(); + SDValue NewXor = DAG.getNode(ISD::XOR, Op1.getDebugLoc(), VT, + Op1.getOperand(0), + DAG.getConstant(~XorC, VT)); + return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, NewXor, + DAG.getConstant(C->getAPIntValue()+1, VT)); + } + } + + return OptimizeConditionalInDecrement(N, DAG); +} + +SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + switch (N->getOpcode()) { + default: break; + case ISD::EXTRACT_VECTOR_ELT: + return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this); + case ISD::VSELECT: + case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget); + case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI); + case ISD::ADD: return OptimizeConditionalInDecrement(N, DAG); + case ISD::SUB: return PerformSubCombine(N, DAG); + case X86ISD::ADC: return PerformADCCombine(N, DAG, DCI); + case ISD::MUL: return PerformMulCombine(N, DAG, DCI); + case ISD::SHL: + case ISD::SRA: + case ISD::SRL: return PerformShiftCombine(N, DAG, Subtarget); + case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget); + case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget); + case ISD::LOAD: return PerformLOADCombine(N, DAG, Subtarget); + case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); + case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this); + case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget); + case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget); + case X86ISD::FXOR: + case X86ISD::FOR: return PerformFORCombine(N, DAG); + case X86ISD::FAND: return PerformFANDCombine(N, DAG); + case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); + case 
X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); + case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG); + case X86ISD::SETCC: return PerformSETCCCombine(N, DAG); + case X86ISD::SHUFPS: // Handle all target specific shuffles + case X86ISD::SHUFPD: + case X86ISD::PALIGN: + case X86ISD::PUNPCKHBW: + case X86ISD::PUNPCKHWD: + case X86ISD::PUNPCKHDQ: + case X86ISD::PUNPCKHQDQ: + case X86ISD::UNPCKHPS: + case X86ISD::UNPCKHPD: + case X86ISD::VUNPCKHPSY: + case X86ISD::VUNPCKHPDY: + case X86ISD::PUNPCKLBW: + case X86ISD::PUNPCKLWD: + case X86ISD::PUNPCKLDQ: + case X86ISD::PUNPCKLQDQ: + case X86ISD::UNPCKLPS: + case X86ISD::UNPCKLPD: + case X86ISD::VUNPCKLPSY: + case X86ISD::VUNPCKLPDY: + case X86ISD::MOVHLPS: + case X86ISD::MOVLHPS: + case X86ISD::PSHUFD: + case X86ISD::PSHUFHW: + case X86ISD::PSHUFLW: + case X86ISD::MOVSS: + case X86ISD::MOVSD: + case X86ISD::VPERMILPS: + case X86ISD::VPERMILPSY: + case X86ISD::VPERMILPD: + case X86ISD::VPERMILPDY: + case X86ISD::VPERM2F128: + case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget); + } + + return SDValue(); +} + +/// isTypeDesirableForOp - Return true if the target has native support for +/// the specified value type and it is 'desirable' to use the type for the +/// given node type. e.g. On x86 i16 is legal, but undesirable since i16 +/// instruction encodings are longer and some i16 instructions are slow. +bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const { + if (!isTypeLegal(VT)) + return false; + if (VT != MVT::i16) + return true; + + switch (Opc) { + default: + return true; + case ISD::LOAD: + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: + case ISD::ANY_EXTEND: + case ISD::SHL: + case ISD::SRL: + case ISD::SUB: + case ISD::ADD: + case ISD::MUL: + case ISD::AND: + case ISD::OR: + case ISD::XOR: + return false; + } +} + +/// IsDesirableToPromoteOp - This method query the target whether it is +/// beneficial for dag combiner to promote the specified node. If true, it +/// should return the desired promotion type by reference. +bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const { + EVT VT = Op.getValueType(); + if (VT != MVT::i16) + return false; + + bool Promote = false; + bool Commute = false; + switch (Op.getOpcode()) { + default: break; + case ISD::LOAD: { + LoadSDNode *LD = cast<LoadSDNode>(Op); + // If the non-extending load has a single use and it's not live out, then it + // might be folded. + if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&& + Op.hasOneUse()*/) { + for (SDNode::use_iterator UI = Op.getNode()->use_begin(), + UE = Op.getNode()->use_end(); UI != UE; ++UI) { + // The only case where we'd want to promote LOAD (rather then it being + // promoted as an operand is when it's only use is liveout. + if (UI->getOpcode() != ISD::CopyToReg) + return false; + } + } + Promote = true; + break; + } + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: + case ISD::ANY_EXTEND: + Promote = true; + break; + case ISD::SHL: + case ISD::SRL: { + SDValue N0 = Op.getOperand(0); + // Look out for (store (shl (load), x)). + if (MayFoldLoad(N0) && MayFoldIntoStore(Op)) + return false; + Promote = true; + break; + } + case ISD::ADD: + case ISD::MUL: + case ISD::AND: + case ISD::OR: + case ISD::XOR: + Commute = true; + // fallthrough + case ISD::SUB: { + SDValue N0 = Op.getOperand(0); + SDValue N1 = Op.getOperand(1); + if (!Commute && MayFoldLoad(N1)) + return false; + // Avoid disabling potential load folding opportunities. 
+ if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op))) + return false; + if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op))) + return false; + Promote = true; + } + } + + PVT = MVT::i32; + return Promote; +} + +//===----------------------------------------------------------------------===// +// X86 Inline Assembly Support +//===----------------------------------------------------------------------===// + +bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { + InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); + + std::string AsmStr = IA->getAsmString(); + + // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a" + SmallVector<StringRef, 4> AsmPieces; + SplitString(AsmStr, AsmPieces, ";\n"); + + switch (AsmPieces.size()) { + default: return false; + case 1: + AsmStr = AsmPieces[0]; + AsmPieces.clear(); + SplitString(AsmStr, AsmPieces, " \t"); // Split with whitespace. + + // FIXME: this should verify that we are targeting a 486 or better. If not, + // we will turn this bswap into something that will be lowered to logical ops + // instead of emitting the bswap asm. For now, we don't support 486 or lower + // so don't worry about this. + // bswap $0 + if (AsmPieces.size() == 2 && + (AsmPieces[0] == "bswap" || + AsmPieces[0] == "bswapq" || + AsmPieces[0] == "bswapl") && + (AsmPieces[1] == "$0" || + AsmPieces[1] == "${0:q}")) { + // No need to check constraints, nothing other than the equivalent of + // "=r,0" would be valid here. + IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); + if (!Ty || Ty->getBitWidth() % 16 != 0) + return false; + return IntrinsicLowering::LowerToByteSwap(CI); + } + // rorw $$8, ${0:w} --> llvm.bswap.i16 + if (CI->getType()->isIntegerTy(16) && + AsmPieces.size() == 3 && + (AsmPieces[0] == "rorw" || AsmPieces[0] == "rolw") && + AsmPieces[1] == "$$8," && + AsmPieces[2] == "${0:w}" && + IA->getConstraintString().compare(0, 5, "=r,0,") == 0) { + AsmPieces.clear(); + const std::string &ConstraintsStr = IA->getConstraintString(); + SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); + std::sort(AsmPieces.begin(), AsmPieces.end()); + if (AsmPieces.size() == 4 && + AsmPieces[0] == "~{cc}" && + AsmPieces[1] == "~{dirflag}" && + AsmPieces[2] == "~{flags}" && + AsmPieces[3] == "~{fpsr}") { + IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); + if (!Ty || Ty->getBitWidth() % 16 != 0) + return false; + return IntrinsicLowering::LowerToByteSwap(CI); + } + } + break; + case 3: + if (CI->getType()->isIntegerTy(32) && + IA->getConstraintString().compare(0, 5, "=r,0,") == 0) { + SmallVector<StringRef, 4> Words; + SplitString(AsmPieces[0], Words, " \t,"); + if (Words.size() == 3 && Words[0] == "rorw" && Words[1] == "$$8" && + Words[2] == "${0:w}") { + Words.clear(); + SplitString(AsmPieces[1], Words, " \t,"); + if (Words.size() == 3 && Words[0] == "rorl" && Words[1] == "$$16" && + Words[2] == "$0") { + Words.clear(); + SplitString(AsmPieces[2], Words, " \t,"); + if (Words.size() == 3 && Words[0] == "rorw" && Words[1] == "$$8" && + Words[2] == "${0:w}") { + AsmPieces.clear(); + const std::string &ConstraintsStr = IA->getConstraintString(); + SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); + std::sort(AsmPieces.begin(), AsmPieces.end()); + if (AsmPieces.size() == 4 && + AsmPieces[0] == "~{cc}" && + AsmPieces[1] == "~{dirflag}" && + AsmPieces[2] == "~{flags}" && + AsmPieces[3] == "~{fpsr}") { + IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); + if (!Ty || 
Ty->getBitWidth() % 16 != 0) + return false; + return IntrinsicLowering::LowerToByteSwap(CI); + } + } + } + } + } + + if (CI->getType()->isIntegerTy(64)) { + InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints(); + if (Constraints.size() >= 2 && + Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && + Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { + // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64 + SmallVector<StringRef, 4> Words; + SplitString(AsmPieces[0], Words, " \t"); + if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") { + Words.clear(); + SplitString(AsmPieces[1], Words, " \t"); + if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") { + Words.clear(); + SplitString(AsmPieces[2], Words, " \t,"); + if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" && + Words[2] == "%edx") { + IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); + if (!Ty || Ty->getBitWidth() % 16 != 0) + return false; + return IntrinsicLowering::LowerToByteSwap(CI); + } + } + } + } + } + break; + } + return false; +} + + + +/// getConstraintType - Given a constraint letter, return the type of +/// constraint it is for this target. +X86TargetLowering::ConstraintType +X86TargetLowering::getConstraintType(const std::string &Constraint) const { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + case 'R': + case 'q': + case 'Q': + case 'f': + case 't': + case 'u': + case 'y': + case 'x': + case 'Y': + case 'l': + return C_RegisterClass; + case 'a': + case 'b': + case 'c': + case 'd': + case 'S': + case 'D': + case 'A': + return C_Register; + case 'I': + case 'J': + case 'K': + case 'L': + case 'M': + case 'N': + case 'G': + case 'C': + case 'e': + case 'Z': + return C_Other; + default: + break; + } + } + return TargetLowering::getConstraintType(Constraint); +} + +/// Examine constraint type and operand type and determine a weight value. +/// This object must already have been set up with the operand type +/// and the current alternative constraint selected. +TargetLowering::ConstraintWeight + X86TargetLowering::getSingleConstraintMatchWeight( + AsmOperandInfo &info, const char *constraint) const { + ConstraintWeight weight = CW_Invalid; + Value *CallOperandVal = info.CallOperandVal; + // If we don't have a value, we can't do a match, + // but allow it at the lowest weight. + if (CallOperandVal == NULL) + return CW_Default; + Type *type = CallOperandVal->getType(); + // Look at the constraint type. 
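+  // The immediate constraints mirror the checks below: 'I' accepts 0..31,
+  // 'J' 0..63, 'K' a signed 8-bit value, 'L' 0xff or 0xffff, 'M' 0..3,
+  // 'N' an unsigned 8-bit value, 'e' a signed 32-bit value and 'Z' an
+  // unsigned 32-bit value.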
+ switch (*constraint) { + default: + weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); + case 'R': + case 'q': + case 'Q': + case 'a': + case 'b': + case 'c': + case 'd': + case 'S': + case 'D': + case 'A': + if (CallOperandVal->getType()->isIntegerTy()) + weight = CW_SpecificReg; + break; + case 'f': + case 't': + case 'u': + if (type->isFloatingPointTy()) + weight = CW_SpecificReg; + break; + case 'y': + if (type->isX86_MMXTy() && Subtarget->hasMMX()) + weight = CW_SpecificReg; + break; + case 'x': + case 'Y': + if ((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasXMM()) + weight = CW_Register; + break; + case 'I': + if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) { + if (C->getZExtValue() <= 31) + weight = CW_Constant; + } + break; + case 'J': + if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { + if (C->getZExtValue() <= 63) + weight = CW_Constant; + } + break; + case 'K': + if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { + if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f)) + weight = CW_Constant; + } + break; + case 'L': + if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { + if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff)) + weight = CW_Constant; + } + break; + case 'M': + if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { + if (C->getZExtValue() <= 3) + weight = CW_Constant; + } + break; + case 'N': + if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { + if (C->getZExtValue() <= 0xff) + weight = CW_Constant; + } + break; + case 'G': + case 'C': + if (dyn_cast<ConstantFP>(CallOperandVal)) { + weight = CW_Constant; + } + break; + case 'e': + if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { + if ((C->getSExtValue() >= -0x80000000LL) && + (C->getSExtValue() <= 0x7fffffffLL)) + weight = CW_Constant; + } + break; + case 'Z': + if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { + if (C->getZExtValue() <= 0xffffffff) + weight = CW_Constant; + } + break; + } + return weight; +} + +/// LowerXConstraint - try to replace an X constraint, which matches anything, +/// with another that has more specific requirements based on the type of the +/// corresponding operand. +const char *X86TargetLowering:: +LowerXConstraint(EVT ConstraintVT) const { + // FP X constraints get lowered to SSE1/2 registers if available, otherwise + // 'f' like normal targets. + if (ConstraintVT.isFloatingPoint()) { + if (Subtarget->hasXMMInt()) + return "Y"; + if (Subtarget->hasXMM()) + return "x"; + } + + return TargetLowering::LowerXConstraint(ConstraintVT); +} + +/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops +/// vector. If it is invalid, don't add anything to Ops. +void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, + std::string &Constraint, + std::vector<SDValue>&Ops, + SelectionDAG &DAG) const { + SDValue Result(0, 0); + + // Only support length 1 constraints for now. 
+ if (Constraint.length() > 1) return; + + char ConstraintLetter = Constraint[0]; + switch (ConstraintLetter) { + default: break; + case 'I': + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + if (C->getZExtValue() <= 31) { + Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); + break; + } + } + return; + case 'J': + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + if (C->getZExtValue() <= 63) { + Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); + break; + } + } + return; + case 'K': + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + if ((int8_t)C->getSExtValue() == C->getSExtValue()) { + Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); + break; + } + } + return; + case 'N': + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + if (C->getZExtValue() <= 255) { + Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); + break; + } + } + return; + case 'e': { + // 32-bit signed value + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), + C->getSExtValue())) { + // Widen to 64 bits here to get it sign extended. + Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64); + break; + } + // FIXME gcc accepts some relocatable values here too, but only in certain + // memory models; it's complicated. + } + return; + } + case 'Z': { + // 32-bit unsigned value + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), + C->getZExtValue())) { + Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); + break; + } + } + // FIXME gcc accepts some relocatable values here too, but only in certain + // memory models; it's complicated. + return; + } + case 'i': { + // Literal immediates are always ok. + if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) { + // Widen to 64 bits here to get it sign extended. + Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64); + break; + } + + // In any sort of PIC mode addresses need to be computed at runtime by + // adding in a register or some sort of table lookup. These can't + // be used as immediates. + if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC()) + return; + + // If we are in non-pic codegen mode, we allow the address of a global (with + // an optional displacement) to be used with 'i'. + GlobalAddressSDNode *GA = 0; + int64_t Offset = 0; + + // Match either (GA), (GA+C), (GA+C1+C2), etc. + while (1) { + if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) { + Offset += GA->getOffset(); + break; + } else if (Op.getOpcode() == ISD::ADD) { + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { + Offset += C->getZExtValue(); + Op = Op.getOperand(0); + continue; + } + } else if (Op.getOpcode() == ISD::SUB) { + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { + Offset += -C->getZExtValue(); + Op = Op.getOperand(0); + continue; + } + } + + // Otherwise, this isn't something we can handle, reject it. + return; + } + + const GlobalValue *GV = GA->getGlobal(); + // If we require an extra load to get this address, as in PIC mode, we + // can't accept it. 
+ if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV, + getTargetMachine()))) + return; + + Result = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(), + GA->getValueType(0), Offset); + break; + } + } + + if (Result.getNode()) { + Ops.push_back(Result); + return; + } + return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); +} + +std::pair<unsigned, const TargetRegisterClass*> +X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, + EVT VT) const { + // First, see if this is a constraint that directly corresponds to an LLVM + // register class. + if (Constraint.size() == 1) { + // GCC Constraint Letters + switch (Constraint[0]) { + default: break; + // TODO: Slight differences here in allocation order and leaving + // RIP in the class. Do they matter any more here than they do + // in the normal allocation? + case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode. + if (Subtarget->is64Bit()) { + if (VT == MVT::i32 || VT == MVT::f32) + return std::make_pair(0U, X86::GR32RegisterClass); + else if (VT == MVT::i16) + return std::make_pair(0U, X86::GR16RegisterClass); + else if (VT == MVT::i8 || VT == MVT::i1) + return std::make_pair(0U, X86::GR8RegisterClass); + else if (VT == MVT::i64 || VT == MVT::f64) + return std::make_pair(0U, X86::GR64RegisterClass); + break; + } + // 32-bit fallthrough + case 'Q': // Q_REGS + if (VT == MVT::i32 || VT == MVT::f32) + return std::make_pair(0U, X86::GR32_ABCDRegisterClass); + else if (VT == MVT::i16) + return std::make_pair(0U, X86::GR16_ABCDRegisterClass); + else if (VT == MVT::i8 || VT == MVT::i1) + return std::make_pair(0U, X86::GR8_ABCD_LRegisterClass); + else if (VT == MVT::i64) + return std::make_pair(0U, X86::GR64_ABCDRegisterClass); + break; + case 'r': // GENERAL_REGS + case 'l': // INDEX_REGS + if (VT == MVT::i8 || VT == MVT::i1) + return std::make_pair(0U, X86::GR8RegisterClass); + if (VT == MVT::i16) + return std::make_pair(0U, X86::GR16RegisterClass); + if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit()) + return std::make_pair(0U, X86::GR32RegisterClass); + return std::make_pair(0U, X86::GR64RegisterClass); + case 'R': // LEGACY_REGS + if (VT == MVT::i8 || VT == MVT::i1) + return std::make_pair(0U, X86::GR8_NOREXRegisterClass); + if (VT == MVT::i16) + return std::make_pair(0U, X86::GR16_NOREXRegisterClass); + if (VT == MVT::i32 || !Subtarget->is64Bit()) + return std::make_pair(0U, X86::GR32_NOREXRegisterClass); + return std::make_pair(0U, X86::GR64_NOREXRegisterClass); + case 'f': // FP Stack registers. + // If SSE is enabled for this VT, use f80 to ensure the isel moves the + // value to the correct fpstack register class. + if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT)) + return std::make_pair(0U, X86::RFP32RegisterClass); + if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT)) + return std::make_pair(0U, X86::RFP64RegisterClass); + return std::make_pair(0U, X86::RFP80RegisterClass); + case 'y': // MMX_REGS if MMX allowed. + if (!Subtarget->hasMMX()) break; + return std::make_pair(0U, X86::VR64RegisterClass); + case 'Y': // SSE_REGS if SSE2 allowed + if (!Subtarget->hasXMMInt()) break; + // FALL THROUGH. + case 'x': // SSE_REGS if SSE1 allowed + if (!Subtarget->hasXMM()) break; + + switch (VT.getSimpleVT().SimpleTy) { + default: break; + // Scalar SSE types. + case MVT::f32: + case MVT::i32: + return std::make_pair(0U, X86::FR32RegisterClass); + case MVT::f64: + case MVT::i64: + return std::make_pair(0U, X86::FR64RegisterClass); + // Vector types. 
+ case MVT::v16i8: + case MVT::v8i16: + case MVT::v4i32: + case MVT::v2i64: + case MVT::v4f32: + case MVT::v2f64: + return std::make_pair(0U, X86::VR128RegisterClass); + } + break; + } + } + + // Use the default implementation in TargetLowering to convert the register + // constraint into a member of a register class. + std::pair<unsigned, const TargetRegisterClass*> Res; + Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); + + // Not found as a standard register? + if (Res.second == 0) { + // Map st(0) -> st(7) -> ST0 + if (Constraint.size() == 7 && Constraint[0] == '{' && + tolower(Constraint[1]) == 's' && + tolower(Constraint[2]) == 't' && + Constraint[3] == '(' && + (Constraint[4] >= '0' && Constraint[4] <= '7') && + Constraint[5] == ')' && + Constraint[6] == '}') { + + Res.first = X86::ST0+Constraint[4]-'0'; + Res.second = X86::RFP80RegisterClass; + return Res; + } + + // GCC allows "st(0)" to be called just plain "st". + if (StringRef("{st}").equals_lower(Constraint)) { + Res.first = X86::ST0; + Res.second = X86::RFP80RegisterClass; + return Res; + } + + // flags -> EFLAGS + if (StringRef("{flags}").equals_lower(Constraint)) { + Res.first = X86::EFLAGS; + Res.second = X86::CCRRegisterClass; + return Res; + } + + // 'A' means EAX + EDX. + if (Constraint == "A") { + Res.first = X86::EAX; + Res.second = X86::GR32_ADRegisterClass; + return Res; + } + return Res; + } + + // Otherwise, check to see if this is a register class of the wrong value + // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to + // turn into {ax},{dx}. + if (Res.second->hasType(VT)) + return Res; // Correct type already, nothing to do. + + // All of the single-register GCC register classes map their values onto + // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp". If we + // really want an 8-bit or 32-bit register, map to the appropriate register + // class and return the appropriate register. 
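+  // For example, the constraint "{ax}" used with an i32 operand is remapped
+  // to EAX in GR32 below, and with an i64 operand to RAX in GR64.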
+ if (Res.second == X86::GR16RegisterClass) { + if (VT == MVT::i8) { + unsigned DestReg = 0; + switch (Res.first) { + default: break; + case X86::AX: DestReg = X86::AL; break; + case X86::DX: DestReg = X86::DL; break; + case X86::CX: DestReg = X86::CL; break; + case X86::BX: DestReg = X86::BL; break; + } + if (DestReg) { + Res.first = DestReg; + Res.second = X86::GR8RegisterClass; + } + } else if (VT == MVT::i32) { + unsigned DestReg = 0; + switch (Res.first) { + default: break; + case X86::AX: DestReg = X86::EAX; break; + case X86::DX: DestReg = X86::EDX; break; + case X86::CX: DestReg = X86::ECX; break; + case X86::BX: DestReg = X86::EBX; break; + case X86::SI: DestReg = X86::ESI; break; + case X86::DI: DestReg = X86::EDI; break; + case X86::BP: DestReg = X86::EBP; break; + case X86::SP: DestReg = X86::ESP; break; + } + if (DestReg) { + Res.first = DestReg; + Res.second = X86::GR32RegisterClass; + } + } else if (VT == MVT::i64) { + unsigned DestReg = 0; + switch (Res.first) { + default: break; + case X86::AX: DestReg = X86::RAX; break; + case X86::DX: DestReg = X86::RDX; break; + case X86::CX: DestReg = X86::RCX; break; + case X86::BX: DestReg = X86::RBX; break; + case X86::SI: DestReg = X86::RSI; break; + case X86::DI: DestReg = X86::RDI; break; + case X86::BP: DestReg = X86::RBP; break; + case X86::SP: DestReg = X86::RSP; break; + } + if (DestReg) { + Res.first = DestReg; + Res.second = X86::GR64RegisterClass; + } + } + } else if (Res.second == X86::FR32RegisterClass || + Res.second == X86::FR64RegisterClass || + Res.second == X86::VR128RegisterClass) { + // Handle references to XMM physical registers that got mapped into the + // wrong class. This can happen with constraints like {xmm0} where the + // target independent register mapper will just pick the first match it can + // find, ignoring the required type. + if (VT == MVT::f32) + Res.second = X86::FR32RegisterClass; + else if (VT == MVT::f64) + Res.second = X86::FR64RegisterClass; + else if (X86::VR128RegisterClass->hasType(VT)) + Res.second = X86::VR128RegisterClass; + } + + return Res; +} diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.h b/contrib/llvm/lib/Target/X86/X86ISelLowering.h new file mode 100644 index 0000000..342a5e6 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.h @@ -0,0 +1,979 @@ +//===-- X86ISelLowering.h - X86 DAG Lowering Interface ----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the interfaces that X86 uses to lower LLVM code into a +// selection DAG. +// +//===----------------------------------------------------------------------===// + +#ifndef X86ISELLOWERING_H +#define X86ISELLOWERING_H + +#include "X86Subtarget.h" +#include "X86RegisterInfo.h" +#include "X86MachineFunctionInfo.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/CodeGen/FastISel.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/CallingConvLower.h" + +namespace llvm { + namespace X86ISD { + // X86 Specific DAG Nodes + enum NodeType { + // Start the numbering where the builtin ops leave off. + FIRST_NUMBER = ISD::BUILTIN_OP_END, + + /// BSF - Bit scan forward. + /// BSR - Bit scan reverse. + BSF, + BSR, + + /// SHLD, SHRD - Double shift instructions. 
These correspond to + /// X86::SHLDxx and X86::SHRDxx instructions. + SHLD, + SHRD, + + /// FAND - Bitwise logical AND of floating point values. This corresponds + /// to X86::ANDPS or X86::ANDPD. + FAND, + + /// FOR - Bitwise logical OR of floating point values. This corresponds + /// to X86::ORPS or X86::ORPD. + FOR, + + /// FXOR - Bitwise logical XOR of floating point values. This corresponds + /// to X86::XORPS or X86::XORPD. + FXOR, + + /// FSRL - Bitwise logical right shift of floating point values. These + /// corresponds to X86::PSRLDQ. + FSRL, + + /// CALL - These operations represent an abstract X86 call + /// instruction, which includes a bunch of information. In particular the + /// operands of these node are: + /// + /// #0 - The incoming token chain + /// #1 - The callee + /// #2 - The number of arg bytes the caller pushes on the stack. + /// #3 - The number of arg bytes the callee pops off the stack. + /// #4 - The value to pass in AL/AX/EAX (optional) + /// #5 - The value to pass in DL/DX/EDX (optional) + /// + /// The result values of these nodes are: + /// + /// #0 - The outgoing token chain + /// #1 - The first register result value (optional) + /// #2 - The second register result value (optional) + /// + CALL, + + /// RDTSC_DAG - This operation implements the lowering for + /// readcyclecounter + RDTSC_DAG, + + /// X86 compare and logical compare instructions. + CMP, COMI, UCOMI, + + /// X86 bit-test instructions. + BT, + + /// X86 SetCC. Operand 0 is condition code, and operand 1 is the EFLAGS + /// operand, usually produced by a CMP instruction. + SETCC, + + // Same as SETCC except it's materialized with a sbb and the value is all + // one's or all zero's. + SETCC_CARRY, // R = carry_bit ? ~0 : 0 + + /// X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD. + /// Operands are two FP values to compare; result is a mask of + /// 0s or 1s. Generally DTRT for C/C++ with NaNs. + FSETCCss, FSETCCsd, + + /// X86 MOVMSK{pd|ps}, extracts sign bits of two or four FP values, + /// result in an integer GPR. Needs masking for scalar result. + FGETSIGNx86, + + /// X86 conditional moves. Operand 0 and operand 1 are the two values + /// to select from. Operand 2 is the condition code, and operand 3 is the + /// flag operand produced by a CMP or TEST instruction. It also writes a + /// flag result. + CMOV, + + /// X86 conditional branches. Operand 0 is the chain operand, operand 1 + /// is the block to branch if condition is true, operand 2 is the + /// condition code, and operand 3 is the flag operand produced by a CMP + /// or TEST instruction. + BRCOND, + + /// Return with a flag operand. Operand 0 is the chain operand, operand + /// 1 is the number of bytes of stack to pop. + RET_FLAG, + + /// REP_STOS - Repeat fill, corresponds to X86::REP_STOSx. + REP_STOS, + + /// REP_MOVS - Repeat move, corresponds to X86::REP_MOVSx. + REP_MOVS, + + /// GlobalBaseReg - On Darwin, this node represents the result of the popl + /// at function entry, used for PIC code. + GlobalBaseReg, + + /// Wrapper - A wrapper node for TargetConstantPool, + /// TargetExternalSymbol, and TargetGlobalAddress. + Wrapper, + + /// WrapperRIP - Special wrapper used under X86-64 PIC mode for RIP + /// relative displacements. + WrapperRIP, + + /// MOVQ2DQ - Copies a 64-bit value from an MMX vector to the low word + /// of an XMM vector, with the high word zero filled. + MOVQ2DQ, + + /// MOVDQ2Q - Copies a 64-bit value from the low word of an XMM vector + /// to an MMX vector. 
If you think this is too close to the previous + /// mnemonic, so do I; blame Intel. + MOVDQ2Q, + + /// PEXTRB - Extract an 8-bit value from a vector and zero extend it to + /// i32, corresponds to X86::PEXTRB. + PEXTRB, + + /// PEXTRW - Extract a 16-bit value from a vector and zero extend it to + /// i32, corresponds to X86::PEXTRW. + PEXTRW, + + /// INSERTPS - Insert any element of a 4 x float vector into any element + /// of a destination 4 x floatvector. + INSERTPS, + + /// PINSRB - Insert the lower 8-bits of a 32-bit value to a vector, + /// corresponds to X86::PINSRB. + PINSRB, + + /// PINSRW - Insert the lower 16-bits of a 32-bit value to a vector, + /// corresponds to X86::PINSRW. + PINSRW, MMX_PINSRW, + + /// PSHUFB - Shuffle 16 8-bit values within a vector. + PSHUFB, + + /// ANDNP - Bitwise Logical AND NOT of Packed FP values. + ANDNP, + + /// PSIGNB/W/D - Copy integer sign. + PSIGNB, PSIGNW, PSIGND, + + /// BLEND family of opcodes + BLENDV, + + /// FHADD - Floating point horizontal add. + FHADD, + + /// FHSUB - Floating point horizontal sub. + FHSUB, + + /// FMAX, FMIN - Floating point max and min. + /// + FMAX, FMIN, + + /// FRSQRT, FRCP - Floating point reciprocal-sqrt and reciprocal + /// approximation. Note that these typically require refinement + /// in order to obtain suitable precision. + FRSQRT, FRCP, + + // TLSADDR - Thread Local Storage. + TLSADDR, + + // TLSCALL - Thread Local Storage. When calling to an OS provided + // thunk at the address from an earlier relocation. + TLSCALL, + + // EH_RETURN - Exception Handling helpers. + EH_RETURN, + + /// TC_RETURN - Tail call return. + /// operand #0 chain + /// operand #1 callee (register or absolute) + /// operand #2 stack adjustment + /// operand #3 optional in flag + TC_RETURN, + + // VZEXT_MOVL - Vector move low and zero extend. + VZEXT_MOVL, + + // VSHL, VSRL - Vector logical left / right shift. + VSHL, VSRL, + + // CMPPD, CMPPS - Vector double/float comparison. + // CMPPD, CMPPS - Vector double/float comparison. + CMPPD, CMPPS, + + // PCMP* - Vector integer comparisons. + PCMPEQB, PCMPEQW, PCMPEQD, PCMPEQQ, + PCMPGTB, PCMPGTW, PCMPGTD, PCMPGTQ, + + // ADD, SUB, SMUL, etc. - Arithmetic operations with FLAGS results. + ADD, SUB, ADC, SBB, SMUL, + INC, DEC, OR, XOR, AND, + + ANDN, // ANDN - Bitwise AND NOT with FLAGS results. + + UMUL, // LOW, HI, FLAGS = umul LHS, RHS + + // MUL_IMM - X86 specific multiply by immediate. + MUL_IMM, + + // PTEST - Vector bitwise comparisons + PTEST, + + // TESTP - Vector packed fp sign bitwise comparisons + TESTP, + + // Several flavors of instructions with vector shuffle behaviors. + PALIGN, + PSHUFD, + PSHUFHW, + PSHUFLW, + PSHUFHW_LD, + PSHUFLW_LD, + SHUFPD, + SHUFPS, + MOVDDUP, + MOVSHDUP, + MOVSLDUP, + MOVSHDUP_LD, + MOVSLDUP_LD, + MOVLHPS, + MOVLHPD, + MOVHLPS, + MOVHLPD, + MOVLPS, + MOVLPD, + MOVSD, + MOVSS, + UNPCKLPS, + UNPCKLPD, + VUNPCKLPSY, + VUNPCKLPDY, + UNPCKHPS, + UNPCKHPD, + VUNPCKHPSY, + VUNPCKHPDY, + PUNPCKLBW, + PUNPCKLWD, + PUNPCKLDQ, + PUNPCKLQDQ, + PUNPCKHBW, + PUNPCKHWD, + PUNPCKHDQ, + PUNPCKHQDQ, + VPERMILPS, + VPERMILPSY, + VPERMILPD, + VPERMILPDY, + VPERM2F128, + VBROADCAST, + + // VASTART_SAVE_XMM_REGS - Save xmm argument registers to the stack, + // according to %al. An operator is needed so that this can be expanded + // with control flow. + VASTART_SAVE_XMM_REGS, + + // WIN_ALLOCA - Windows's _chkstk call to do stack probing. + WIN_ALLOCA, + + // SEG_ALLOCA - For allocating variable amounts of stack space when using + // segmented stacks. 
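Of the nodes listed above, FRSQRT and FRCP deliver only a low-precision estimate (roughly 12 bits), and the refinement the comment mentions is normally one Newton-Raphson step. A standalone scalar sketch of that step for the reciprocal square root (refineRsqrt is an illustrative name, not part of this header):

// One Newton-Raphson refinement of an approximate 1/sqrt(a):
//   x1 = x0 * (1.5 - 0.5 * a * x0 * x0)
// roughly doubles the number of correct bits in the estimate.
static float refineRsqrt(float A, float X0) {
  return X0 * (1.5f - 0.5f * A * X0 * X0);
}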
Check if the current stacklet has enough space, and + // falls back to heap allocation if not. + SEG_ALLOCA, + + // Memory barrier + MEMBARRIER, + MFENCE, + SFENCE, + LFENCE, + + // ATOMADD64_DAG, ATOMSUB64_DAG, ATOMOR64_DAG, ATOMAND64_DAG, + // ATOMXOR64_DAG, ATOMNAND64_DAG, ATOMSWAP64_DAG - + // Atomic 64-bit binary operations. + ATOMADD64_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE, + ATOMSUB64_DAG, + ATOMOR64_DAG, + ATOMXOR64_DAG, + ATOMAND64_DAG, + ATOMNAND64_DAG, + ATOMSWAP64_DAG, + + // LCMPXCHG_DAG, LCMPXCHG8_DAG, LCMPXCHG16_DAG - Compare and swap. + LCMPXCHG_DAG, + LCMPXCHG8_DAG, + LCMPXCHG16_DAG, + + // VZEXT_LOAD - Load, scalar_to_vector, and zero extend. + VZEXT_LOAD, + + // FNSTCW16m - Store FP control world into i16 memory. + FNSTCW16m, + + /// FP_TO_INT*_IN_MEM - This instruction implements FP_TO_SINT with the + /// integer destination in memory and a FP reg source. This corresponds + /// to the X86::FIST*m instructions and the rounding mode change stuff. It + /// has two inputs (token chain and address) and two outputs (int value + /// and token chain). + FP_TO_INT16_IN_MEM, + FP_TO_INT32_IN_MEM, + FP_TO_INT64_IN_MEM, + + /// FILD, FILD_FLAG - This instruction implements SINT_TO_FP with the + /// integer source in memory and FP reg result. This corresponds to the + /// X86::FILD*m instructions. It has three inputs (token chain, address, + /// and source type) and two outputs (FP value and token chain). FILD_FLAG + /// also produces a flag). + FILD, + FILD_FLAG, + + /// FLD - This instruction implements an extending load to FP stack slots. + /// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain + /// operand, ptr to load from, and a ValueType node indicating the type + /// to load to. + FLD, + + /// FST - This instruction implements a truncating store to FP stack + /// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a + /// chain operand, value to store, address, and a ValueType to store it + /// as. + FST, + + /// VAARG_64 - This instruction grabs the address of the next argument + /// from a va_list. (reads and modifies the va_list in memory) + VAARG_64 + + // WARNING: Do not add anything in the end unless you want the node to + // have memop! In fact, starting from ATOMADD64_DAG all opcodes will be + // thought as target memory ops! + }; + } + + /// Define some predicates that are used for node matching. + namespace X86 { + /// isPSHUFDMask - Return true if the specified VECTOR_SHUFFLE operand + /// specifies a shuffle of elements that is suitable for input to PSHUFD. + bool isPSHUFDMask(ShuffleVectorSDNode *N); + + /// isPSHUFHWMask - Return true if the specified VECTOR_SHUFFLE operand + /// specifies a shuffle of elements that is suitable for input to PSHUFD. + bool isPSHUFHWMask(ShuffleVectorSDNode *N); + + /// isPSHUFLWMask - Return true if the specified VECTOR_SHUFFLE operand + /// specifies a shuffle of elements that is suitable for input to PSHUFD. + bool isPSHUFLWMask(ShuffleVectorSDNode *N); + + /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand + /// specifies a shuffle of elements that is suitable for input to SHUFP*. + bool isSHUFPMask(ShuffleVectorSDNode *N); + + /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand + /// specifies a shuffle of elements that is suitable for input to MOVHLPS. + bool isMOVHLPSMask(ShuffleVectorSDNode *N); + + /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form + /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. 
vector_shuffle v, undef, + /// <2, 3, 2, 3> + bool isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N); + + /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand + /// specifies a shuffle of elements that is suitable for MOVLP{S|D}. + bool isMOVLPMask(ShuffleVectorSDNode *N); + + /// isMOVHPMask - Return true if the specified VECTOR_SHUFFLE operand + /// specifies a shuffle of elements that is suitable for MOVHP{S|D}. + /// as well as MOVLHPS. + bool isMOVLHPSMask(ShuffleVectorSDNode *N); + + /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand + /// specifies a shuffle of elements that is suitable for input to UNPCKL. + bool isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat = false); + + /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand + /// specifies a shuffle of elements that is suitable for input to UNPCKH. + bool isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat = false); + + /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form + /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef, + /// <0, 0, 1, 1> + bool isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N); + + /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form + /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef, + /// <2, 2, 3, 3> + bool isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N); + + /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand + /// specifies a shuffle of elements that is suitable for input to MOVSS, + /// MOVSD, and MOVD, i.e. setting the lowest element. + bool isMOVLMask(ShuffleVectorSDNode *N); + + /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand + /// specifies a shuffle of elements that is suitable for input to MOVSHDUP. + bool isMOVSHDUPMask(ShuffleVectorSDNode *N, const X86Subtarget *Subtarget); + + /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand + /// specifies a shuffle of elements that is suitable for input to MOVSLDUP. + bool isMOVSLDUPMask(ShuffleVectorSDNode *N, const X86Subtarget *Subtarget); + + /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand + /// specifies a shuffle of elements that is suitable for input to MOVDDUP. + bool isMOVDDUPMask(ShuffleVectorSDNode *N); + + /// isVEXTRACTF128Index - Return true if the specified + /// EXTRACT_SUBVECTOR operand specifies a vector extract that is + /// suitable for input to VEXTRACTF128. + bool isVEXTRACTF128Index(SDNode *N); + + /// isVINSERTF128Index - Return true if the specified + /// INSERT_SUBVECTOR operand specifies a subvector insert that is + /// suitable for input to VINSERTF128. + bool isVINSERTF128Index(SDNode *N); + + /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle + /// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUF* and SHUFP* + /// instructions. + unsigned getShuffleSHUFImmediate(SDNode *N); + + /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle + /// the specified VECTOR_SHUFFLE mask with PSHUFHW instruction. + unsigned getShufflePSHUFHWImmediate(SDNode *N); + + /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle + /// the specified VECTOR_SHUFFLE mask with PSHUFLW instruction. + unsigned getShufflePSHUFLWImmediate(SDNode *N); + + /// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle + /// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. 
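The getShuffle*Immediate helpers above all come down to packing source-element indices into an 8-bit immediate; PSHUFD and SHUFP use two bits per destination element. A standalone sketch of that packing (encodePSHUFDImmediate is a hypothetical name, not one of these declarations):

#include <array>
#include <cstdint>

// Pack a 4-element shuffle mask into a PSHUFD-style immediate: destination
// element i reads source element (Imm >> (2*i)) & 3.
static uint8_t encodePSHUFDImmediate(const std::array<int, 4> &Mask) {
  uint8_t Imm = 0;
  for (int i = 3; i >= 0; --i)
    Imm = static_cast<uint8_t>((Imm << 2) | (Mask[i] & 3));
  return Imm;
}
// The identity mask {0, 1, 2, 3} encodes as 0xE4 (binary 11'10'01'00).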
+ unsigned getShufflePALIGNRImmediate(SDNode *N); + + /// getExtractVEXTRACTF128Immediate - Return the appropriate + /// immediate to extract the specified EXTRACT_SUBVECTOR index + /// with VEXTRACTF128 instructions. + unsigned getExtractVEXTRACTF128Immediate(SDNode *N); + + /// getInsertVINSERTF128Immediate - Return the appropriate + /// immediate to insert at the specified INSERT_SUBVECTOR index + /// with VINSERTF128 instructions. + unsigned getInsertVINSERTF128Immediate(SDNode *N); + + /// isZeroNode - Returns true if Elt is a constant zero or a floating point + /// constant +0.0. + bool isZeroNode(SDValue Elt); + + /// isOffsetSuitableForCodeModel - Returns true of the given offset can be + /// fit into displacement field of the instruction. + bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, + bool hasSymbolicDisplacement = true); + + + /// isCalleePop - Determines whether the callee is required to pop its + /// own arguments. Callee pop is necessary to support tail calls. + bool isCalleePop(CallingConv::ID CallingConv, + bool is64Bit, bool IsVarArg, bool TailCallOpt); + } + + //===--------------------------------------------------------------------===// + // X86TargetLowering - X86 Implementation of the TargetLowering interface + class X86TargetLowering : public TargetLowering { + public: + explicit X86TargetLowering(X86TargetMachine &TM); + + virtual unsigned getJumpTableEncoding() const; + + virtual MVT getShiftAmountTy(EVT LHSTy) const { return MVT::i8; } + + virtual const MCExpr * + LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, + const MachineBasicBlock *MBB, unsigned uid, + MCContext &Ctx) const; + + /// getPICJumpTableRelocaBase - Returns relocation base for the given PIC + /// jumptable. + virtual SDValue getPICJumpTableRelocBase(SDValue Table, + SelectionDAG &DAG) const; + virtual const MCExpr * + getPICJumpTableRelocBaseExpr(const MachineFunction *MF, + unsigned JTI, MCContext &Ctx) const; + + /// getStackPtrReg - Return the stack pointer register we are using: either + /// ESP or RSP. + unsigned getStackPtrReg() const { return X86StackPtr; } + + /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate + /// function arguments in the caller parameter area. For X86, aggregates + /// that contains are placed at 16-byte boundaries while the rest are at + /// 4-byte boundaries. + virtual unsigned getByValTypeAlignment(Type *Ty) const; + + /// getOptimalMemOpType - Returns the target specific optimal type for load + /// and store operations as a result of memset, memcpy, and memmove + /// lowering. If DstAlign is zero that means it's safe to destination + /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it + /// means there isn't a need to check it against alignment requirement, + /// probably because the source does not need to be loaded. If + /// 'NonScalarIntSafe' is true, that means it's safe to return a + /// non-scalar-integer type, e.g. empty string source, constant, or loaded + /// from memory. 'MemcpyStrSrc' indicates whether the memcpy source is + /// constant so it does not need to be loaded. + /// It returns EVT::Other if the type should be determined using generic + /// target-independent logic. + virtual EVT + getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, + bool NonScalarIntSafe, bool MemcpyStrSrc, + MachineFunction &MF) const; + + /// allowsUnalignedMemoryAccesses - Returns true if the target allows + /// unaligned memory accesses. of the specified type. 
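getOptimalMemOpType and allowsUnalignedMemoryAccesses (defined just below) together steer memset/memcpy lowering toward the widest chunk the subtarget can store cheaply. A much-simplified standalone model of that decision (the enum and helper are illustrative assumptions, not this class's API):

#include <cstdint>

enum class CopyChunk { Vec128, Int64, Int32 };

// Simplified model: use 16-byte SSE stores for large copies when either the
// destination alignment is ours to choose (DstAlign == 0, following the
// hook's convention) or it is already 16-byte aligned; otherwise fall back
// to the widest general-purpose register.
static CopyChunk pickMemcpyChunk(uint64_t Size, unsigned DstAlign,
                                 bool HasSSE, bool Is64Bit) {
  if (HasSSE && Size >= 16 && (DstAlign == 0 || DstAlign >= 16))
    return CopyChunk::Vec128;
  return Is64Bit ? CopyChunk::Int64 : CopyChunk::Int32;
}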
+ virtual bool allowsUnalignedMemoryAccesses(EVT VT) const { + return true; + } + + /// LowerOperation - Provide custom lowering hooks for some operations. + /// + virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; + + /// ReplaceNodeResults - Replace the results of node with an illegal result + /// type with new values built out of custom code. + /// + virtual void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results, + SelectionDAG &DAG) const; + + + virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const; + + /// isTypeDesirableForOp - Return true if the target has native support for + /// the specified value type and it is 'desirable' to use the type for the + /// given node type. e.g. On x86 i16 is legal, but undesirable since i16 + /// instruction encodings are longer and some i16 instructions are slow. + virtual bool isTypeDesirableForOp(unsigned Opc, EVT VT) const; + + /// isTypeDesirable - Return true if the target has native support for the + /// specified value type and it is 'desirable' to use the type. e.g. On x86 + /// i16 is legal, but undesirable since i16 instruction encodings are longer + /// and some i16 instructions are slow. + virtual bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const; + + virtual MachineBasicBlock * + EmitInstrWithCustomInserter(MachineInstr *MI, + MachineBasicBlock *MBB) const; + + + /// getTargetNodeName - This method returns the name of a target specific + /// DAG node. + virtual const char *getTargetNodeName(unsigned Opcode) const; + + /// getSetCCResultType - Return the value type to use for ISD::SETCC. + virtual EVT getSetCCResultType(EVT VT) const; + + /// computeMaskedBitsForTargetNode - Determine which of the bits specified + /// in Mask are known to be either zero or one and return them in the + /// KnownZero/KnownOne bitsets. + virtual void computeMaskedBitsForTargetNode(const SDValue Op, + const APInt &Mask, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth = 0) const; + + // ComputeNumSignBitsForTargetNode - Determine the number of bits in the + // operation that are sign bits. + virtual unsigned ComputeNumSignBitsForTargetNode(SDValue Op, + unsigned Depth) const; + + virtual bool + isGAPlusOffset(SDNode *N, const GlobalValue* &GA, int64_t &Offset) const; + + SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const; + + virtual bool ExpandInlineAsm(CallInst *CI) const; + + ConstraintType getConstraintType(const std::string &Constraint) const; + + /// Examine constraint string and operand type and determine a weight value. + /// The operand object must already have been set up with the operand type. + virtual ConstraintWeight getSingleConstraintMatchWeight( + AsmOperandInfo &info, const char *constraint) const; + + virtual const char *LowerXConstraint(EVT ConstraintVT) const; + + /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops + /// vector. If it is invalid, don't add anything to Ops. If hasMemory is + /// true it means one of the asm constraint of the inline asm instruction + /// being processed is 'm'. + virtual void LowerAsmOperandForConstraint(SDValue Op, + std::string &Constraint, + std::vector<SDValue> &Ops, + SelectionDAG &DAG) const; + + /// getRegForInlineAsmConstraint - Given a physical register constraint + /// (e.g. {edx}), return the register number and the register class for the + /// register. This should only be used for C_Register constraints. On + /// error, this returns a register number of 0. 
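Besides the single-letter constraints, this hook also accepts explicit physical-register strings such as "{st(3)}", handled earlier in this patch by a character-by-character check. The same check as a standalone sketch (parseSTConstraint is a hypothetical name):

#include <cctype>
#include <string>

// Return the FP-stack index 0-7 for a "{st(N)}" constraint, or -1 if the
// string is not of that exact seven-character form.
static int parseSTConstraint(const std::string &C) {
  if (C.size() != 7 || C.front() != '{' || C.back() != '}')
    return -1;
  if (std::tolower(static_cast<unsigned char>(C[1])) != 's' ||
      std::tolower(static_cast<unsigned char>(C[2])) != 't' ||
      C[3] != '(' || C[5] != ')')
    return -1;
  return (C[4] >= '0' && C[4] <= '7') ? C[4] - '0' : -1;
}
// parseSTConstraint("{st(3)}") == 3; parseSTConstraint("{eax}") == -1.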
+ std::pair<unsigned, const TargetRegisterClass*> + getRegForInlineAsmConstraint(const std::string &Constraint, + EVT VT) const; + + /// isLegalAddressingMode - Return true if the addressing mode represented + /// by AM is legal for this target, for a load/store of the specified type. + virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty)const; + + /// isTruncateFree - Return true if it's free to truncate a value of + /// type Ty1 to type Ty2. e.g. On x86 it's free to truncate a i32 value in + /// register EAX to i16 by referencing its sub-register AX. + virtual bool isTruncateFree(Type *Ty1, Type *Ty2) const; + virtual bool isTruncateFree(EVT VT1, EVT VT2) const; + + /// isZExtFree - Return true if any actual instruction that defines a + /// value of type Ty1 implicit zero-extends the value to Ty2 in the result + /// register. This does not necessarily include registers defined in + /// unknown ways, such as incoming arguments, or copies from unknown + /// virtual registers. Also, if isTruncateFree(Ty2, Ty1) is true, this + /// does not necessarily apply to truncate instructions. e.g. on x86-64, + /// all instructions that define 32-bit values implicit zero-extend the + /// result out to 64 bits. + virtual bool isZExtFree(Type *Ty1, Type *Ty2) const; + virtual bool isZExtFree(EVT VT1, EVT VT2) const; + + /// isNarrowingProfitable - Return true if it's profitable to narrow + /// operations of type VT1 to VT2. e.g. on x86, it's profitable to narrow + /// from i32 to i8 but not from i32 to i16. + virtual bool isNarrowingProfitable(EVT VT1, EVT VT2) const; + + /// isFPImmLegal - Returns true if the target can instruction select the + /// specified FP immediate natively. If false, the legalizer will + /// materialize the FP immediate as a load from a constant pool. + virtual bool isFPImmLegal(const APFloat &Imm, EVT VT) const; + + /// isShuffleMaskLegal - Targets can use this to indicate that they only + /// support *some* VECTOR_SHUFFLE operations, those with specific masks. + /// By default, if a target supports the VECTOR_SHUFFLE node, all mask + /// values are assumed to be legal. + virtual bool isShuffleMaskLegal(const SmallVectorImpl<int> &Mask, + EVT VT) const; + + /// isVectorClearMaskLegal - Similar to isShuffleMaskLegal. This is + /// used by Targets can use this to indicate if there is a suitable + /// VECTOR_SHUFFLE that can be used to replace a VAND with a constant + /// pool entry. + virtual bool isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, + EVT VT) const; + + /// ShouldShrinkFPConstant - If true, then instruction selection should + /// seek to shrink the FP constant of the specified type to a smaller type + /// in order to save space and / or reduce runtime. + virtual bool ShouldShrinkFPConstant(EVT VT) const { + // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more + // expensive than a straight movsd. On the other hand, it's important to + // shrink long double fp constant since fldt is very slow. + return !X86ScalarSSEf64 || VT == MVT::f80; + } + + const X86Subtarget* getSubtarget() const { + return Subtarget; + } + + /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is + /// computed in an SSE register, not on the X87 floating point stack. 
+ bool isScalarFPTypeInSSEReg(EVT VT) const { + return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2 + (VT == MVT::f32 && X86ScalarSSEf32); // f32 is when SSE1 + } + + /// createFastISel - This method returns a target specific FastISel object, + /// or null if the target does not support "fast" ISel. + virtual FastISel *createFastISel(FunctionLoweringInfo &funcInfo) const; + + /// getStackCookieLocation - Return true if the target stores stack + /// protector cookies at a fixed offset in some non-standard address + /// space, and populates the address space and offset as + /// appropriate. + virtual bool getStackCookieLocation(unsigned &AddressSpace, unsigned &Offset) const; + + SDValue BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot, + SelectionDAG &DAG) const; + + protected: + std::pair<const TargetRegisterClass*, uint8_t> + findRepresentativeClass(EVT VT) const; + + private: + /// Subtarget - Keep a pointer to the X86Subtarget around so that we can + /// make the right decision when generating code for different targets. + const X86Subtarget *Subtarget; + const X86RegisterInfo *RegInfo; + const TargetData *TD; + + /// X86StackPtr - X86 physical register used as stack ptr. + unsigned X86StackPtr; + + /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87 + /// floating point ops. + /// When SSE is available, use it for f32 operations. + /// When SSE2 is available, use it for f64 operations. + bool X86ScalarSSEf32; + bool X86ScalarSSEf64; + + /// LegalFPImmediates - A list of legal fp immediates. + std::vector<APFloat> LegalFPImmediates; + + /// addLegalFPImmediate - Indicate that this x86 target can instruction + /// select the specified FP immediate natively. + void addLegalFPImmediate(const APFloat& Imm) { + LegalFPImmediates.push_back(Imm); + } + + SDValue LowerCallResult(SDValue Chain, SDValue InFlag, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const; + SDValue LowerMemArgument(SDValue Chain, + CallingConv::ID CallConv, + const SmallVectorImpl<ISD::InputArg> &ArgInfo, + DebugLoc dl, SelectionDAG &DAG, + const CCValAssign &VA, MachineFrameInfo *MFI, + unsigned i) const; + SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg, + DebugLoc dl, SelectionDAG &DAG, + const CCValAssign &VA, + ISD::ArgFlagsTy Flags) const; + + // Call lowering helpers. + + /// IsEligibleForTailCallOptimization - Check whether the call is eligible + /// for tail call optimization. Targets which want to do tail call + /// optimization should implement this function. 
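A few of the eligibility conditions are structural: calls involving sret on either side are rejected, the calling conventions must be compatible, and varargs callees are only handled in restricted cases. A much-simplified, conservative standalone model (the struct and helper names are illustrative; the real routine also walks the full argument lists):

struct CallSiteInfo {
  bool IsVarArg;
  bool CalleeStructRet;
  bool CallerStructRet;
  bool SameCallingConv;
};

// Conservative sketch: reject anything involving sret or varargs and require
// matching conventions.
static bool mayTailCall(const CallSiteInfo &CS) {
  if (CS.CalleeStructRet || CS.CallerStructRet)
    return false;               // sret calls are rejected here
  if (!CS.SameCallingConv)
    return false;               // incompatible conventions block the transform
  return !CS.IsVarArg;          // varargs callees rejected in this sketch
}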
+ bool IsEligibleForTailCallOptimization(SDValue Callee, + CallingConv::ID CalleeCC, + bool isVarArg, + bool isCalleeStructRet, + bool isCallerStructRet, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SmallVectorImpl<ISD::InputArg> &Ins, + SelectionDAG& DAG) const; + bool IsCalleePop(bool isVarArg, CallingConv::ID CallConv) const; + SDValue EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr, + SDValue Chain, bool IsTailCall, bool Is64Bit, + int FPDiff, DebugLoc dl) const; + + unsigned GetAlignedArgumentStackSize(unsigned StackSize, + SelectionDAG &DAG) const; + + std::pair<SDValue,SDValue> FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, + bool isSigned) const; + + SDValue LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, + SelectionDAG &DAG) const; + SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, + int64_t Offset, SelectionDAG &DAG) const; + SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBITCAST(SDValue op, SelectionDAG &DAG) const; + SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFABS(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFNEG(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerToBT(SDValue And, ISD::CondCode CC, + DebugLoc dl, SelectionDAG &DAG) const; + SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerMEMSET(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerRETURNADDR(SDValue 
Op, SelectionDAG &DAG) const; + SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerADD(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerREADCYCLECOUNTER(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; + + // Utility functions to help LowerVECTOR_SHUFFLE + SDValue LowerVECTOR_SHUFFLEv8i16(SDValue Op, SelectionDAG &DAG) const; + + virtual SDValue + LowerFormalArguments(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const; + virtual SDValue + LowerCall(SDValue Chain, SDValue Callee, + CallingConv::ID CallConv, bool isVarArg, bool &isTailCall, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const; + + virtual SDValue + LowerReturn(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + DebugLoc dl, SelectionDAG &DAG) const; + + virtual bool isUsedByReturnOnly(SDNode *N) const; + + virtual bool mayBeEmittedAsTailCall(CallInst *CI) const; + + virtual EVT + getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT, + ISD::NodeType ExtendKind) const; + + virtual bool + CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, + bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + LLVMContext &Context) const; + + void ReplaceATOMIC_BINARY_64(SDNode *N, SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG, unsigned NewOp) const; + + /// Utility function to emit string processing sse4.2 instructions + /// that return in xmm0. + /// This takes the instruction to expand, the associated machine basic + /// block, the number of args, and whether or not the second arg is + /// in memory or not. + MachineBasicBlock *EmitPCMP(MachineInstr *BInstr, MachineBasicBlock *BB, + unsigned argNum, bool inMem) const; + + /// Utility functions to emit monitor and mwait instructions. These + /// need to make sure that the arguments to the intrinsic are in the + /// correct registers. + MachineBasicBlock *EmitMonitor(MachineInstr *MI, + MachineBasicBlock *BB) const; + MachineBasicBlock *EmitMwait(MachineInstr *MI, MachineBasicBlock *BB) const; + + /// Utility function to emit atomic bitwise operations (and, or, xor). 
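The expansion these atomic emitters produce is the familiar load / modify / LOCK CMPXCHG retry loop. The same shape written against std::atomic, purely to illustrate the pattern rather than the MachineInstr-level code the inserter builds:

#include <atomic>
#include <cstdint>

// Atomic NAND via a compare-exchange retry loop; the X86 custom inserter
// emits the equivalent loop around LOCK CMPXCHG.
static uint32_t atomicNand(std::atomic<uint32_t> &A, uint32_t Operand) {
  uint32_t Old = A.load();
  while (!A.compare_exchange_weak(Old, ~(Old & Operand)))
    ;  // Old is reloaded with the current value on failure; retry.
  return Old;
}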
+ /// It takes the bitwise instruction to expand, the associated machine basic + /// block, and the associated X86 opcodes for reg/reg and reg/imm. + MachineBasicBlock *EmitAtomicBitwiseWithCustomInserter( + MachineInstr *BInstr, + MachineBasicBlock *BB, + unsigned regOpc, + unsigned immOpc, + unsigned loadOpc, + unsigned cxchgOpc, + unsigned notOpc, + unsigned EAXreg, + TargetRegisterClass *RC, + bool invSrc = false) const; + + MachineBasicBlock *EmitAtomicBit6432WithCustomInserter( + MachineInstr *BInstr, + MachineBasicBlock *BB, + unsigned regOpcL, + unsigned regOpcH, + unsigned immOpcL, + unsigned immOpcH, + bool invSrc = false) const; + + /// Utility function to emit atomic min and max. It takes the min/max + /// instruction to expand, the associated basic block, and the associated + /// cmov opcode for moving the min or max value. + MachineBasicBlock *EmitAtomicMinMaxWithCustomInserter(MachineInstr *BInstr, + MachineBasicBlock *BB, + unsigned cmovOpc) const; + + // Utility function to emit the low-level va_arg code for X86-64. + MachineBasicBlock *EmitVAARG64WithCustomInserter( + MachineInstr *MI, + MachineBasicBlock *MBB) const; + + /// Utility function to emit the xmm reg save portion of va_start. + MachineBasicBlock *EmitVAStartSaveXMMRegsWithCustomInserter( + MachineInstr *BInstr, + MachineBasicBlock *BB) const; + + MachineBasicBlock *EmitLoweredSelect(MachineInstr *I, + MachineBasicBlock *BB) const; + + MachineBasicBlock *EmitLoweredWinAlloca(MachineInstr *MI, + MachineBasicBlock *BB) const; + + MachineBasicBlock *EmitLoweredSegAlloca(MachineInstr *MI, + MachineBasicBlock *BB, + bool Is64Bit) const; + + MachineBasicBlock *EmitLoweredTLSCall(MachineInstr *MI, + MachineBasicBlock *BB) const; + + MachineBasicBlock *emitLoweredTLSAddr(MachineInstr *MI, + MachineBasicBlock *BB) const; + + /// Emit nodes that will be selected as "test Op0,Op0", or something + /// equivalent, for use with the given x86 condition code. + SDValue EmitTest(SDValue Op0, unsigned X86CC, SelectionDAG &DAG) const; + + /// Emit nodes that will be selected as "cmp Op0,Op1", or something + /// equivalent, for use with the given x86 condition code. + SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, + SelectionDAG &DAG) const; + }; + + namespace X86 { + FastISel *createFastISel(FunctionLoweringInfo &funcInfo); + } +} + +#endif // X86ISELLOWERING_H diff --git a/contrib/llvm/lib/Target/X86/X86Instr3DNow.td b/contrib/llvm/lib/Target/X86/X86Instr3DNow.td new file mode 100644 index 0000000..dd4f6a5 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86Instr3DNow.td @@ -0,0 +1,102 @@ +//====- X86Instr3DNow.td - The 3DNow! Instruction Set ------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the 3DNow! instruction set, which extends MMX to support +// floating point and also adds a few more random instructions for good measure. 
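Most of these are lane-wise operations on packed data held in the 64-bit MMX registers; PAVGUSB, for example, is a rounded unsigned-byte average. A scalar model of a single PAVGUSB lane, for illustration only:

#include <cstdint>

// One lane of PAVGUSB: the rounded average (a + b + 1) >> 1 of two unsigned
// bytes, applied to all eight bytes of the MMX register.
static uint8_t pavgusbLane(uint8_t A, uint8_t B) {
  return static_cast<uint8_t>((static_cast<unsigned>(A) + B + 1) >> 1);
}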
+// +//===----------------------------------------------------------------------===// + +class I3DNow<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pat> + : I<o, F, outs, ins, asm, pat>, TB, Requires<[Has3DNow]> { +} + +class I3DNow_binop<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat> + : I3DNow<o, F, (outs VR64:$dst), ins, + !strconcat(Mnemonic, "\t{$src2, $dst|$dst, $src2}"), pat>, + Has3DNow0F0FOpcode { + // FIXME: The disassembler doesn't support Has3DNow0F0FOpcode yet. + let isAsmParserOnly = 1; + let Constraints = "$src1 = $dst"; +} + +class I3DNow_conv<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat> + : I3DNow<o, F, (outs VR64:$dst), ins, + !strconcat(Mnemonic, "\t{$src, $dst|$dst, $src}"), pat>, + Has3DNow0F0FOpcode { + // FIXME: The disassembler doesn't support Has3DNow0F0FOpcode yet. + let isAsmParserOnly = 1; +} + +multiclass I3DNow_binop_rm<bits<8> opc, string Mn> { + def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn, []>; + def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn, []>; +} + +multiclass I3DNow_binop_rm_int<bits<8> opc, string Mn, string Ver = ""> { + def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn, + [(set VR64:$dst, (!cast<Intrinsic>( + !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1, VR64:$src2))]>; + def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn, + [(set VR64:$dst, (!cast<Intrinsic>( + !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1, + (bitconvert (load_mmx addr:$src2))))]>; +} + +multiclass I3DNow_conv_rm<bits<8> opc, string Mn> { + def rr : I3DNow_conv<opc, MRMSrcReg, (ins VR64:$src1), Mn, []>; + def rm : I3DNow_conv<opc, MRMSrcMem, (ins i64mem:$src1), Mn, []>; +} + +multiclass I3DNow_conv_rm_int<bits<8> opc, string Mn, string Ver = ""> { + def rr : I3DNow_conv<opc, MRMSrcReg, (ins VR64:$src), Mn, + [(set VR64:$dst, (!cast<Intrinsic>( + !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src))]>; + def rm : I3DNow_conv<opc, MRMSrcMem, (ins i64mem:$src), Mn, + [(set VR64:$dst, (!cast<Intrinsic>( + !strconcat("int_x86_3dnow", Ver, "_", Mn)) + (bitconvert (load_mmx addr:$src))))]>; +} + +defm PAVGUSB : I3DNow_binop_rm_int<0xBF, "pavgusb">; +defm PF2ID : I3DNow_conv_rm_int<0x1D, "pf2id">; +defm PFACC : I3DNow_binop_rm_int<0xAE, "pfacc">; +defm PFADD : I3DNow_binop_rm_int<0x9E, "pfadd">; +defm PFCMPEQ : I3DNow_binop_rm_int<0xB0, "pfcmpeq">; +defm PFCMPGE : I3DNow_binop_rm_int<0x90, "pfcmpge">; +defm PFCMPGT : I3DNow_binop_rm_int<0xA0, "pfcmpgt">; +defm PFMAX : I3DNow_binop_rm_int<0xA4, "pfmax">; +defm PFMIN : I3DNow_binop_rm_int<0x94, "pfmin">; +defm PFMUL : I3DNow_binop_rm_int<0xB4, "pfmul">; +defm PFRCP : I3DNow_conv_rm_int<0x96, "pfrcp">; +defm PFRCPIT1 : I3DNow_binop_rm_int<0xA6, "pfrcpit1">; +defm PFRCPIT2 : I3DNow_binop_rm_int<0xB6, "pfrcpit2">; +defm PFRSQIT1 : I3DNow_binop_rm_int<0xA7, "pfrsqit1">; +defm PFRSQRT : I3DNow_conv_rm_int<0x97, "pfrsqrt">; +defm PFSUB : I3DNow_binop_rm_int<0x9A, "pfsub">; +defm PFSUBR : I3DNow_binop_rm_int<0xAA, "pfsubr">; +defm PI2FD : I3DNow_conv_rm_int<0x0D, "pi2fd">; +defm PMULHRW : I3DNow_binop_rm_int<0xB7, "pmulhrw">; + + +def FEMMS : I3DNow<0x0E, RawFrm, (outs), (ins), "femms", [(int_x86_mmx_femms)]>; + +def PREFETCH : I3DNow<0x0D, MRM0m, (outs), (ins i32mem:$addr), + "prefetch $addr", []>; + +// FIXME: Diassembler gets a bogus decode conflict. 
+let isAsmParserOnly = 1 in +def PREFETCHW : I3DNow<0x0D, MRM1m, (outs), (ins i16mem:$addr), + "prefetchw $addr", []>; + +// "3DNowA" instructions +defm PF2IW : I3DNow_conv_rm_int<0x1C, "pf2iw", "a">; +defm PI2FW : I3DNow_conv_rm_int<0x0C, "pi2fw", "a">; +defm PFNACC : I3DNow_binop_rm_int<0x8A, "pfnacc", "a">; +defm PFPNACC : I3DNow_binop_rm_int<0x8E, "pfpnacc", "a">; +defm PSWAPD : I3DNow_conv_rm_int<0xBB, "pswapd", "a">; diff --git a/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td b/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td new file mode 100644 index 0000000..74b647a --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td @@ -0,0 +1,1173 @@ +//===- X86InstrArithmetic.td - Integer Arithmetic Instrs ---*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the integer arithmetic instructions in the X86 +// architecture. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// LEA - Load Effective Address + +let neverHasSideEffects = 1 in +def LEA16r : I<0x8D, MRMSrcMem, + (outs GR16:$dst), (ins i32mem:$src), + "lea{w}\t{$src|$dst}, {$dst|$src}", []>, OpSize; +let isReMaterializable = 1 in +def LEA32r : I<0x8D, MRMSrcMem, + (outs GR32:$dst), (ins i32mem:$src), + "lea{l}\t{$src|$dst}, {$dst|$src}", + [(set GR32:$dst, lea32addr:$src)]>, Requires<[In32BitMode]>; + +def LEA64_32r : I<0x8D, MRMSrcMem, + (outs GR32:$dst), (ins lea64_32mem:$src), + "lea{l}\t{$src|$dst}, {$dst|$src}", + [(set GR32:$dst, lea32addr:$src)]>, Requires<[In64BitMode]>; + +let isReMaterializable = 1 in +def LEA64r : RI<0x8D, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "lea{q}\t{$src|$dst}, {$dst|$src}", + [(set GR64:$dst, lea64addr:$src)]>; + + + +//===----------------------------------------------------------------------===// +// Fixed-Register Multiplication and Division Instructions. +// + +// Extra precision multiplication + +// AL is really implied by AX, but the registers in Defs must match the +// SDNode results (i8, i32). +let Defs = [AL,EFLAGS,AX], Uses = [AL] in +def MUL8r : I<0xF6, MRM4r, (outs), (ins GR8:$src), "mul{b}\t$src", + // FIXME: Used for 8-bit mul, ignore result upper 8 bits. + // This probably ought to be moved to a def : Pat<> if the + // syntax can be accepted. + [(set AL, (mul AL, GR8:$src)), + (implicit EFLAGS)]>; // AL,AH = AL*GR8 + +let Defs = [AX,DX,EFLAGS], Uses = [AX], neverHasSideEffects = 1 in +def MUL16r : I<0xF7, MRM4r, (outs), (ins GR16:$src), + "mul{w}\t$src", + []>, OpSize; // AX,DX = AX*GR16 + +let Defs = [EAX,EDX,EFLAGS], Uses = [EAX], neverHasSideEffects = 1 in +def MUL32r : I<0xF7, MRM4r, (outs), (ins GR32:$src), + "mul{l}\t$src", // EAX,EDX = EAX*GR32 + [/*(set EAX, EDX, EFLAGS, (X86umul_flag EAX, GR32:$src))*/]>; +let Defs = [RAX,RDX,EFLAGS], Uses = [RAX], neverHasSideEffects = 1 in +def MUL64r : RI<0xF7, MRM4r, (outs), (ins GR64:$src), + "mul{q}\t$src", // RAX,RDX = RAX*GR64 + [/*(set RAX, RDX, EFLAGS, (X86umul_flag RAX, GR64:$src))*/]>; + +let Defs = [AL,EFLAGS,AX], Uses = [AL] in +def MUL8m : I<0xF6, MRM4m, (outs), (ins i8mem :$src), + "mul{b}\t$src", + // FIXME: Used for 8-bit mul, ignore result upper 8 bits. 
+ // This probably ought to be moved to a def : Pat<> if the + // syntax can be accepted. + [(set AL, (mul AL, (loadi8 addr:$src))), + (implicit EFLAGS)]>; // AL,AH = AL*[mem8] + +let mayLoad = 1, neverHasSideEffects = 1 in { +let Defs = [AX,DX,EFLAGS], Uses = [AX] in +def MUL16m : I<0xF7, MRM4m, (outs), (ins i16mem:$src), + "mul{w}\t$src", + []>, OpSize; // AX,DX = AX*[mem16] + +let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in +def MUL32m : I<0xF7, MRM4m, (outs), (ins i32mem:$src), + "mul{l}\t$src", + []>; // EAX,EDX = EAX*[mem32] +let Defs = [RAX,RDX,EFLAGS], Uses = [RAX], neverHasSideEffects = 1 in +def MUL64m : RI<0xF7, MRM4m, (outs), (ins i64mem:$src), + "mul{q}\t$src", []>; // RAX,RDX = RAX*[mem64] +} + +let neverHasSideEffects = 1 in { +let Defs = [AL,EFLAGS,AX], Uses = [AL] in +def IMUL8r : I<0xF6, MRM5r, (outs), (ins GR8:$src), "imul{b}\t$src", []>; + // AL,AH = AL*GR8 +let Defs = [AX,DX,EFLAGS], Uses = [AX] in +def IMUL16r : I<0xF7, MRM5r, (outs), (ins GR16:$src), "imul{w}\t$src", []>, + OpSize; // AX,DX = AX*GR16 +let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in +def IMUL32r : I<0xF7, MRM5r, (outs), (ins GR32:$src), "imul{l}\t$src", []>; + // EAX,EDX = EAX*GR32 +let Defs = [RAX,RDX,EFLAGS], Uses = [RAX], neverHasSideEffects = 1 in +def IMUL64r : RI<0xF7, MRM5r, (outs), (ins GR64:$src), "imul{q}\t$src", []>; + // RAX,RDX = RAX*GR64 + +let mayLoad = 1 in { +let Defs = [AL,EFLAGS,AX], Uses = [AL] in +def IMUL8m : I<0xF6, MRM5m, (outs), (ins i8mem :$src), + "imul{b}\t$src", []>; // AL,AH = AL*[mem8] +let Defs = [AX,DX,EFLAGS], Uses = [AX] in +def IMUL16m : I<0xF7, MRM5m, (outs), (ins i16mem:$src), + "imul{w}\t$src", []>, OpSize; // AX,DX = AX*[mem16] +let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in +def IMUL32m : I<0xF7, MRM5m, (outs), (ins i32mem:$src), + "imul{l}\t$src", []>; // EAX,EDX = EAX*[mem32] +let Defs = [RAX,RDX,EFLAGS], Uses = [RAX], neverHasSideEffects = 1 in +def IMUL64m : RI<0xF7, MRM5m, (outs), (ins i64mem:$src), + "imul{q}\t$src", []>; // RAX,RDX = RAX*[mem64] +} +} // neverHasSideEffects + + +let Defs = [EFLAGS] in { +let Constraints = "$src1 = $dst" in { + +let isCommutable = 1 in { // X = IMUL Y, Z --> X = IMUL Z, Y +// Register-Register Signed Integer Multiply +def IMUL16rr : I<0xAF, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src1,GR16:$src2), + "imul{w}\t{$src2, $dst|$dst, $src2}", + [(set GR16:$dst, EFLAGS, + (X86smul_flag GR16:$src1, GR16:$src2))]>, TB, OpSize; +def IMUL32rr : I<0xAF, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1,GR32:$src2), + "imul{l}\t{$src2, $dst|$dst, $src2}", + [(set GR32:$dst, EFLAGS, + (X86smul_flag GR32:$src1, GR32:$src2))]>, TB; +def IMUL64rr : RI<0xAF, MRMSrcReg, (outs GR64:$dst), + (ins GR64:$src1, GR64:$src2), + "imul{q}\t{$src2, $dst|$dst, $src2}", + [(set GR64:$dst, EFLAGS, + (X86smul_flag GR64:$src1, GR64:$src2))]>, TB; +} + +// Register-Memory Signed Integer Multiply +def IMUL16rm : I<0xAF, MRMSrcMem, (outs GR16:$dst), + (ins GR16:$src1, i16mem:$src2), + "imul{w}\t{$src2, $dst|$dst, $src2}", + [(set GR16:$dst, EFLAGS, + (X86smul_flag GR16:$src1, (load addr:$src2)))]>, + TB, OpSize; +def IMUL32rm : I<0xAF, MRMSrcMem, (outs GR32:$dst), + (ins GR32:$src1, i32mem:$src2), + "imul{l}\t{$src2, $dst|$dst, $src2}", + [(set GR32:$dst, EFLAGS, + (X86smul_flag GR32:$src1, (load addr:$src2)))]>, TB; +def IMUL64rm : RI<0xAF, MRMSrcMem, (outs GR64:$dst), + (ins GR64:$src1, i64mem:$src2), + "imul{q}\t{$src2, $dst|$dst, $src2}", + [(set GR64:$dst, EFLAGS, + (X86smul_flag GR64:$src1, (load addr:$src2)))]>, TB; +} // Constraints = "$src1 = $dst" + +} // Defs 
= [EFLAGS] + +// Surprisingly enough, these are not two address instructions! +let Defs = [EFLAGS] in { +// Register-Integer Signed Integer Multiply +def IMUL16rri : Ii16<0x69, MRMSrcReg, // GR16 = GR16*I16 + (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2), + "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR16:$dst, EFLAGS, + (X86smul_flag GR16:$src1, imm:$src2))]>, OpSize; +def IMUL16rri8 : Ii8<0x6B, MRMSrcReg, // GR16 = GR16*I8 + (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), + "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR16:$dst, EFLAGS, + (X86smul_flag GR16:$src1, i16immSExt8:$src2))]>, + OpSize; +def IMUL32rri : Ii32<0x69, MRMSrcReg, // GR32 = GR32*I32 + (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2), + "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32:$dst, EFLAGS, + (X86smul_flag GR32:$src1, imm:$src2))]>; +def IMUL32rri8 : Ii8<0x6B, MRMSrcReg, // GR32 = GR32*I8 + (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), + "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32:$dst, EFLAGS, + (X86smul_flag GR32:$src1, i32immSExt8:$src2))]>; +def IMUL64rri32 : RIi32<0x69, MRMSrcReg, // GR64 = GR64*I32 + (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2), + "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR64:$dst, EFLAGS, + (X86smul_flag GR64:$src1, i64immSExt32:$src2))]>; +def IMUL64rri8 : RIi8<0x6B, MRMSrcReg, // GR64 = GR64*I8 + (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2), + "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR64:$dst, EFLAGS, + (X86smul_flag GR64:$src1, i64immSExt8:$src2))]>; + + +// Memory-Integer Signed Integer Multiply +def IMUL16rmi : Ii16<0x69, MRMSrcMem, // GR16 = [mem16]*I16 + (outs GR16:$dst), (ins i16mem:$src1, i16imm:$src2), + "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR16:$dst, EFLAGS, + (X86smul_flag (load addr:$src1), imm:$src2))]>, + OpSize; +def IMUL16rmi8 : Ii8<0x6B, MRMSrcMem, // GR16 = [mem16]*I8 + (outs GR16:$dst), (ins i16mem:$src1, i16i8imm :$src2), + "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR16:$dst, EFLAGS, + (X86smul_flag (load addr:$src1), + i16immSExt8:$src2))]>, OpSize; +def IMUL32rmi : Ii32<0x69, MRMSrcMem, // GR32 = [mem32]*I32 + (outs GR32:$dst), (ins i32mem:$src1, i32imm:$src2), + "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32:$dst, EFLAGS, + (X86smul_flag (load addr:$src1), imm:$src2))]>; +def IMUL32rmi8 : Ii8<0x6B, MRMSrcMem, // GR32 = [mem32]*I8 + (outs GR32:$dst), (ins i32mem:$src1, i32i8imm: $src2), + "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32:$dst, EFLAGS, + (X86smul_flag (load addr:$src1), + i32immSExt8:$src2))]>; +def IMUL64rmi32 : RIi32<0x69, MRMSrcMem, // GR64 = [mem64]*I32 + (outs GR64:$dst), (ins i64mem:$src1, i64i32imm:$src2), + "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR64:$dst, EFLAGS, + (X86smul_flag (load addr:$src1), + i64immSExt32:$src2))]>; +def IMUL64rmi8 : RIi8<0x6B, MRMSrcMem, // GR64 = [mem64]*I8 + (outs GR64:$dst), (ins i64mem:$src1, i64i8imm: $src2), + "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR64:$dst, EFLAGS, + (X86smul_flag (load addr:$src1), + i64immSExt8:$src2))]>; +} // Defs = [EFLAGS] + + + + +// unsigned division/remainder +let Defs = [AL,EFLAGS,AX], Uses = [AX] in +def DIV8r : I<0xF6, MRM6r, (outs), (ins GR8:$src), // AX/r8 = AL,AH + "div{b}\t$src", []>; +let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in +def DIV16r : I<0xF7, MRM6r, (outs), (ins GR16:$src), // DX:AX/r16 = AX,DX + "div{w}\t$src", []>, 
OpSize; +let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in +def DIV32r : I<0xF7, MRM6r, (outs), (ins GR32:$src), // EDX:EAX/r32 = EAX,EDX + "div{l}\t$src", []>; +// RDX:RAX/r64 = RAX,RDX +let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in +def DIV64r : RI<0xF7, MRM6r, (outs), (ins GR64:$src), + "div{q}\t$src", []>; + +let mayLoad = 1 in { +let Defs = [AL,EFLAGS,AX], Uses = [AX] in +def DIV8m : I<0xF6, MRM6m, (outs), (ins i8mem:$src), // AX/[mem8] = AL,AH + "div{b}\t$src", []>; +let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in +def DIV16m : I<0xF7, MRM6m, (outs), (ins i16mem:$src), // DX:AX/[mem16] = AX,DX + "div{w}\t$src", []>, OpSize; +let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in // EDX:EAX/[mem32] = EAX,EDX +def DIV32m : I<0xF7, MRM6m, (outs), (ins i32mem:$src), + "div{l}\t$src", []>; +// RDX:RAX/[mem64] = RAX,RDX +let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in +def DIV64m : RI<0xF7, MRM6m, (outs), (ins i64mem:$src), + "div{q}\t$src", []>; +} + +// Signed division/remainder. +let Defs = [AL,EFLAGS,AX], Uses = [AX] in +def IDIV8r : I<0xF6, MRM7r, (outs), (ins GR8:$src), // AX/r8 = AL,AH + "idiv{b}\t$src", []>; +let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in +def IDIV16r: I<0xF7, MRM7r, (outs), (ins GR16:$src), // DX:AX/r16 = AX,DX + "idiv{w}\t$src", []>, OpSize; +let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in +def IDIV32r: I<0xF7, MRM7r, (outs), (ins GR32:$src), // EDX:EAX/r32 = EAX,EDX + "idiv{l}\t$src", []>; +// RDX:RAX/r64 = RAX,RDX +let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in +def IDIV64r: RI<0xF7, MRM7r, (outs), (ins GR64:$src), + "idiv{q}\t$src", []>; + +let mayLoad = 1, mayLoad = 1 in { +let Defs = [AL,EFLAGS,AX], Uses = [AX] in +def IDIV8m : I<0xF6, MRM7m, (outs), (ins i8mem:$src), // AX/[mem8] = AL,AH + "idiv{b}\t$src", []>; +let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in +def IDIV16m: I<0xF7, MRM7m, (outs), (ins i16mem:$src), // DX:AX/[mem16] = AX,DX + "idiv{w}\t$src", []>, OpSize; +let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in // EDX:EAX/[mem32] = EAX,EDX +def IDIV32m: I<0xF7, MRM7m, (outs), (ins i32mem:$src), + "idiv{l}\t$src", []>; +let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in // RDX:RAX/[mem64] = RAX,RDX +def IDIV64m: RI<0xF7, MRM7m, (outs), (ins i64mem:$src), + "idiv{q}\t$src", []>; +} + +//===----------------------------------------------------------------------===// +// Two address Instructions. 
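The DIV/IDIV definitions above declare no explicit operands for the dividend and results because the hardware fixes them: the dividend lives in a register pair and one instruction produces both quotient and remainder. A scalar model of the unsigned 16-bit case (divide-by-zero and quotient overflow, which raise #DE, are not modeled):

#include <cstdint>
#include <utility>

// Unsigned 16-bit divide: dividend is DX:AX, quotient lands in AX and the
// remainder in DX (mirroring the "DX:AX/r16 = AX,DX" comments above).
static std::pair<uint16_t, uint16_t> div16(uint16_t DX, uint16_t AX,
                                           uint16_t Divisor) {
  uint32_t Dividend = (static_cast<uint32_t>(DX) << 16) | AX;
  return {static_cast<uint16_t>(Dividend / Divisor),    // -> AX (quotient)
          static_cast<uint16_t>(Dividend % Divisor)};   // -> DX (remainder)
}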
+// + +// unary instructions +let CodeSize = 2 in { +let Defs = [EFLAGS] in { +let Constraints = "$src1 = $dst" in { +def NEG8r : I<0xF6, MRM3r, (outs GR8 :$dst), (ins GR8 :$src1), + "neg{b}\t$dst", + [(set GR8:$dst, (ineg GR8:$src1)), + (implicit EFLAGS)]>; +def NEG16r : I<0xF7, MRM3r, (outs GR16:$dst), (ins GR16:$src1), + "neg{w}\t$dst", + [(set GR16:$dst, (ineg GR16:$src1)), + (implicit EFLAGS)]>, OpSize; +def NEG32r : I<0xF7, MRM3r, (outs GR32:$dst), (ins GR32:$src1), + "neg{l}\t$dst", + [(set GR32:$dst, (ineg GR32:$src1)), + (implicit EFLAGS)]>; +def NEG64r : RI<0xF7, MRM3r, (outs GR64:$dst), (ins GR64:$src1), "neg{q}\t$dst", + [(set GR64:$dst, (ineg GR64:$src1)), + (implicit EFLAGS)]>; +} // Constraints = "$src1 = $dst" + +def NEG8m : I<0xF6, MRM3m, (outs), (ins i8mem :$dst), + "neg{b}\t$dst", + [(store (ineg (loadi8 addr:$dst)), addr:$dst), + (implicit EFLAGS)]>; +def NEG16m : I<0xF7, MRM3m, (outs), (ins i16mem:$dst), + "neg{w}\t$dst", + [(store (ineg (loadi16 addr:$dst)), addr:$dst), + (implicit EFLAGS)]>, OpSize; +def NEG32m : I<0xF7, MRM3m, (outs), (ins i32mem:$dst), + "neg{l}\t$dst", + [(store (ineg (loadi32 addr:$dst)), addr:$dst), + (implicit EFLAGS)]>; +def NEG64m : RI<0xF7, MRM3m, (outs), (ins i64mem:$dst), "neg{q}\t$dst", + [(store (ineg (loadi64 addr:$dst)), addr:$dst), + (implicit EFLAGS)]>; +} // Defs = [EFLAGS] + + +// Note: NOT does not set EFLAGS! + +let Constraints = "$src1 = $dst" in { +// Match xor -1 to not. Favors these over a move imm + xor to save code size. +let AddedComplexity = 15 in { +def NOT8r : I<0xF6, MRM2r, (outs GR8 :$dst), (ins GR8 :$src1), + "not{b}\t$dst", + [(set GR8:$dst, (not GR8:$src1))]>; +def NOT16r : I<0xF7, MRM2r, (outs GR16:$dst), (ins GR16:$src1), + "not{w}\t$dst", + [(set GR16:$dst, (not GR16:$src1))]>, OpSize; +def NOT32r : I<0xF7, MRM2r, (outs GR32:$dst), (ins GR32:$src1), + "not{l}\t$dst", + [(set GR32:$dst, (not GR32:$src1))]>; +def NOT64r : RI<0xF7, MRM2r, (outs GR64:$dst), (ins GR64:$src1), "not{q}\t$dst", + [(set GR64:$dst, (not GR64:$src1))]>; +} +} // Constraints = "$src1 = $dst" + +def NOT8m : I<0xF6, MRM2m, (outs), (ins i8mem :$dst), + "not{b}\t$dst", + [(store (not (loadi8 addr:$dst)), addr:$dst)]>; +def NOT16m : I<0xF7, MRM2m, (outs), (ins i16mem:$dst), + "not{w}\t$dst", + [(store (not (loadi16 addr:$dst)), addr:$dst)]>, OpSize; +def NOT32m : I<0xF7, MRM2m, (outs), (ins i32mem:$dst), + "not{l}\t$dst", + [(store (not (loadi32 addr:$dst)), addr:$dst)]>; +def NOT64m : RI<0xF7, MRM2m, (outs), (ins i64mem:$dst), "not{q}\t$dst", + [(store (not (loadi64 addr:$dst)), addr:$dst)]>; +} // CodeSize + +// TODO: inc/dec is slow for P4, but fast for Pentium-M. +let Defs = [EFLAGS] in { +let Constraints = "$src1 = $dst" in { +let CodeSize = 2 in +def INC8r : I<0xFE, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1), + "inc{b}\t$dst", + [(set GR8:$dst, EFLAGS, (X86inc_flag GR8:$src1))]>; + +let isConvertibleToThreeAddress = 1, CodeSize = 1 in { // Can xform into LEA. 
+def INC16r : I<0x40, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1), + "inc{w}\t$dst", + [(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src1))]>, + OpSize, Requires<[In32BitMode]>; +def INC32r : I<0x40, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1), + "inc{l}\t$dst", + [(set GR32:$dst, EFLAGS, (X86inc_flag GR32:$src1))]>, + Requires<[In32BitMode]>; +def INC64r : RI<0xFF, MRM0r, (outs GR64:$dst), (ins GR64:$src1), "inc{q}\t$dst", + [(set GR64:$dst, EFLAGS, (X86inc_flag GR64:$src1))]>; +} // isConvertibleToThreeAddress = 1, CodeSize = 1 + + +// In 64-bit mode, single byte INC and DEC cannot be encoded. +let isConvertibleToThreeAddress = 1, CodeSize = 2 in { +// Can transform into LEA. +def INC64_16r : I<0xFF, MRM0r, (outs GR16:$dst), (ins GR16:$src1), + "inc{w}\t$dst", + [(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src1))]>, + OpSize, Requires<[In64BitMode]>; +def INC64_32r : I<0xFF, MRM0r, (outs GR32:$dst), (ins GR32:$src1), + "inc{l}\t$dst", + [(set GR32:$dst, EFLAGS, (X86inc_flag GR32:$src1))]>, + Requires<[In64BitMode]>; +def DEC64_16r : I<0xFF, MRM1r, (outs GR16:$dst), (ins GR16:$src1), + "dec{w}\t$dst", + [(set GR16:$dst, EFLAGS, (X86dec_flag GR16:$src1))]>, + OpSize, Requires<[In64BitMode]>; +def DEC64_32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src1), + "dec{l}\t$dst", + [(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src1))]>, + Requires<[In64BitMode]>; +} // isConvertibleToThreeAddress = 1, CodeSize = 2 + +} // Constraints = "$src1 = $dst" + +let CodeSize = 2 in { + def INC8m : I<0xFE, MRM0m, (outs), (ins i8mem :$dst), "inc{b}\t$dst", + [(store (add (loadi8 addr:$dst), 1), addr:$dst), + (implicit EFLAGS)]>; + def INC16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst), "inc{w}\t$dst", + [(store (add (loadi16 addr:$dst), 1), addr:$dst), + (implicit EFLAGS)]>, + OpSize, Requires<[In32BitMode]>; + def INC32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst), "inc{l}\t$dst", + [(store (add (loadi32 addr:$dst), 1), addr:$dst), + (implicit EFLAGS)]>, + Requires<[In32BitMode]>; + def INC64m : RI<0xFF, MRM0m, (outs), (ins i64mem:$dst), "inc{q}\t$dst", + [(store (add (loadi64 addr:$dst), 1), addr:$dst), + (implicit EFLAGS)]>; + +// These are duplicates of their 32-bit counterparts. Only needed so X86 knows +// how to unfold them. +// FIXME: What is this for?? +def INC64_16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst), "inc{w}\t$dst", + [(store (add (loadi16 addr:$dst), 1), addr:$dst), + (implicit EFLAGS)]>, + OpSize, Requires<[In64BitMode]>; +def INC64_32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst), "inc{l}\t$dst", + [(store (add (loadi32 addr:$dst), 1), addr:$dst), + (implicit EFLAGS)]>, + Requires<[In64BitMode]>; +def DEC64_16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst), "dec{w}\t$dst", + [(store (add (loadi16 addr:$dst), -1), addr:$dst), + (implicit EFLAGS)]>, + OpSize, Requires<[In64BitMode]>; +def DEC64_32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst), "dec{l}\t$dst", + [(store (add (loadi32 addr:$dst), -1), addr:$dst), + (implicit EFLAGS)]>, + Requires<[In64BitMode]>; +} // CodeSize = 2 + +let Constraints = "$src1 = $dst" in { +let CodeSize = 2 in +def DEC8r : I<0xFE, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1), + "dec{b}\t$dst", + [(set GR8:$dst, EFLAGS, (X86dec_flag GR8:$src1))]>; +let isConvertibleToThreeAddress = 1, CodeSize = 1 in { // Can xform into LEA. 
+def DEC16r : I<0x48, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1), + "dec{w}\t$dst", + [(set GR16:$dst, EFLAGS, (X86dec_flag GR16:$src1))]>, + OpSize, Requires<[In32BitMode]>; +def DEC32r : I<0x48, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1), + "dec{l}\t$dst", + [(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src1))]>, + Requires<[In32BitMode]>; +def DEC64r : RI<0xFF, MRM1r, (outs GR64:$dst), (ins GR64:$src1), "dec{q}\t$dst", + [(set GR64:$dst, EFLAGS, (X86dec_flag GR64:$src1))]>; +} // CodeSize = 2 +} // Constraints = "$src1 = $dst" + + +let CodeSize = 2 in { + def DEC8m : I<0xFE, MRM1m, (outs), (ins i8mem :$dst), "dec{b}\t$dst", + [(store (add (loadi8 addr:$dst), -1), addr:$dst), + (implicit EFLAGS)]>; + def DEC16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst), "dec{w}\t$dst", + [(store (add (loadi16 addr:$dst), -1), addr:$dst), + (implicit EFLAGS)]>, + OpSize, Requires<[In32BitMode]>; + def DEC32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst), "dec{l}\t$dst", + [(store (add (loadi32 addr:$dst), -1), addr:$dst), + (implicit EFLAGS)]>, + Requires<[In32BitMode]>; + def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", + [(store (add (loadi64 addr:$dst), -1), addr:$dst), + (implicit EFLAGS)]>; +} // CodeSize = 2 +} // Defs = [EFLAGS] + + +/// X86TypeInfo - This is a bunch of information that describes relevant X86 +/// information about value types. For example, it can tell you what the +/// register class and preferred load to use. +class X86TypeInfo<ValueType vt, string instrsuffix, RegisterClass regclass, + PatFrag loadnode, X86MemOperand memoperand, ImmType immkind, + Operand immoperand, SDPatternOperator immoperator, + Operand imm8operand, SDPatternOperator imm8operator, + bit hasOddOpcode, bit hasOpSizePrefix, bit hasREX_WPrefix> { + /// VT - This is the value type itself. + ValueType VT = vt; + + /// InstrSuffix - This is the suffix used on instructions with this type. For + /// example, i8 -> "b", i16 -> "w", i32 -> "l", i64 -> "q". + string InstrSuffix = instrsuffix; + + /// RegClass - This is the register class associated with this type. For + /// example, i8 -> GR8, i16 -> GR16, i32 -> GR32, i64 -> GR64. + RegisterClass RegClass = regclass; + + /// LoadNode - This is the load node associated with this type. For + /// example, i8 -> loadi8, i16 -> loadi16, i32 -> loadi32, i64 -> loadi64. + PatFrag LoadNode = loadnode; + + /// MemOperand - This is the memory operand associated with this type. For + /// example, i8 -> i8mem, i16 -> i16mem, i32 -> i32mem, i64 -> i64mem. + X86MemOperand MemOperand = memoperand; + + /// ImmEncoding - This is the encoding of an immediate of this type. For + /// example, i8 -> Imm8, i16 -> Imm16, i32 -> Imm32. Note that i64 -> Imm32 + /// since the immediate fields of i64 instructions is a 32-bit sign extended + /// value. + ImmType ImmEncoding = immkind; + + /// ImmOperand - This is the operand kind of an immediate of this type. For + /// example, i8 -> i8imm, i16 -> i16imm, i32 -> i32imm. Note that i64 -> + /// i64i32imm since the immediate fields of i64 instructions is a 32-bit sign + /// extended value. + Operand ImmOperand = immoperand; + + /// ImmOperator - This is the operator that should be used to match an + /// immediate of this kind in a pattern (e.g. imm, or i64immSExt32). + SDPatternOperator ImmOperator = immoperator; + + /// Imm8Operand - This is the operand kind to use for an imm8 of this type. + /// For example, i8 -> <invalid>, i16 -> i16i8imm, i32 -> i32i8imm. 
This is + /// only used for instructions that have a sign-extended imm8 field form. + Operand Imm8Operand = imm8operand; + + /// Imm8Operator - This is the operator that should be used to match an 8-bit + /// sign extended immediate of this kind in a pattern (e.g. imm16immSExt8). + SDPatternOperator Imm8Operator = imm8operator; + + /// HasOddOpcode - This bit is true if the instruction should have an odd (as + /// opposed to even) opcode. Operations on i8 are usually even, operations on + /// other datatypes are odd. + bit HasOddOpcode = hasOddOpcode; + + /// HasOpSizePrefix - This bit is set to true if the instruction should have + /// the 0x66 operand size prefix. This is set for i16 types. + bit HasOpSizePrefix = hasOpSizePrefix; + + /// HasREX_WPrefix - This bit is set to true if the instruction should have + /// the 0x40 REX prefix. This is set for i64 types. + bit HasREX_WPrefix = hasREX_WPrefix; +} + +def invalid_node : SDNode<"<<invalid_node>>", SDTIntLeaf,[],"<<invalid_node>>">; + + +def Xi8 : X86TypeInfo<i8 , "b", GR8 , loadi8 , i8mem , + Imm8 , i8imm , imm, i8imm , invalid_node, + 0, 0, 0>; +def Xi16 : X86TypeInfo<i16, "w", GR16, loadi16, i16mem, + Imm16, i16imm, imm, i16i8imm, i16immSExt8, + 1, 1, 0>; +def Xi32 : X86TypeInfo<i32, "l", GR32, loadi32, i32mem, + Imm32, i32imm, imm, i32i8imm, i32immSExt8, + 1, 0, 0>; +def Xi64 : X86TypeInfo<i64, "q", GR64, loadi64, i64mem, + Imm32, i64i32imm, i64immSExt32, i64i8imm, i64immSExt8, + 1, 0, 1>; + +/// ITy - This instruction base class takes the type info for the instruction. +/// Using this, it: +/// 1. Concatenates together the instruction mnemonic with the appropriate +/// suffix letter, a tab, and the arguments. +/// 2. Infers whether the instruction should have a 0x66 prefix byte. +/// 3. Infers whether the instruction should have a 0x40 REX_W prefix. +/// 4. Infers whether the low bit of the opcode should be 0 (for i8 operations) +/// or 1 (for i16,i32,i64 operations). +class ITy<bits<8> opcode, Format f, X86TypeInfo typeinfo, dag outs, dag ins, + string mnemonic, string args, list<dag> pattern> + : I<{opcode{7}, opcode{6}, opcode{5}, opcode{4}, + opcode{3}, opcode{2}, opcode{1}, typeinfo.HasOddOpcode }, + f, outs, ins, + !strconcat(mnemonic, "{", typeinfo.InstrSuffix, "}\t", args), pattern> { + + // Infer instruction prefixes from type info. + let hasOpSizePrefix = typeinfo.HasOpSizePrefix; + let hasREX_WPrefix = typeinfo.HasREX_WPrefix; +} + +// BinOpRR - Instructions like "add reg, reg, reg". +class BinOpRR<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + dag outlist, list<dag> pattern, Format f = MRMDestReg> + : ITy<opcode, f, typeinfo, outlist, + (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2), + mnemonic, "{$src2, $src1|$src1, $src2}", pattern>; + +// BinOpRR_R - Instructions like "add reg, reg, reg", where the pattern has +// just a regclass (no eflags) as a result. +class BinOpRR_R<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode> + : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), + [(set typeinfo.RegClass:$dst, + (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))]>; + +// BinOpRR_F - Instructions like "cmp reg, Reg", where the pattern has +// just a EFLAGS as a result. 
+class BinOpRR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDPatternOperator opnode, Format f = MRMDestReg> + : BinOpRR<opcode, mnemonic, typeinfo, (outs), + [(set EFLAGS, + (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))], + f>; + +// BinOpRR_RF - Instructions like "add reg, reg, reg", where the pattern has +// both a regclass and EFLAGS as a result. +class BinOpRR_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode> + : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), + [(set typeinfo.RegClass:$dst, EFLAGS, + (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))]>; + +// BinOpRR_RFF - Instructions like "adc reg, reg, reg", where the pattern has +// both a regclass and EFLAGS as a result, and has EFLAGS as input. +class BinOpRR_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode> + : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), + [(set typeinfo.RegClass:$dst, EFLAGS, + (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2, + EFLAGS))]>; + +// BinOpRR_Rev - Instructions like "add reg, reg, reg" (reversed encoding). +class BinOpRR_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo> + : ITy<opcode, MRMSrcReg, typeinfo, + (outs typeinfo.RegClass:$dst), + (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2), + mnemonic, "{$src2, $dst|$dst, $src2}", []> { + // The disassembler should know about this, but not the asmparser. + let isCodeGenOnly = 1; +} + +// BinOpRR_F_Rev - Instructions like "cmp reg, reg" (reversed encoding). +class BinOpRR_F_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo> + : ITy<opcode, MRMSrcReg, typeinfo, (outs), + (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2), + mnemonic, "{$src2, $src1|$src1, $src2}", []> { + // The disassembler should know about this, but not the asmparser. + let isCodeGenOnly = 1; +} + +// BinOpRM - Instructions like "add reg, reg, [mem]". +class BinOpRM<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + dag outlist, list<dag> pattern> + : ITy<opcode, MRMSrcMem, typeinfo, outlist, + (ins typeinfo.RegClass:$src1, typeinfo.MemOperand:$src2), + mnemonic, "{$src2, $src1|$src1, $src2}", pattern>; + +// BinOpRM_R - Instructions like "add reg, reg, [mem]". +class BinOpRM_R<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode> + : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), + [(set typeinfo.RegClass:$dst, + (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2)))]>; + +// BinOpRM_F - Instructions like "cmp reg, [mem]". +class BinOpRM_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDPatternOperator opnode> + : BinOpRM<opcode, mnemonic, typeinfo, (outs), + [(set EFLAGS, + (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2)))]>; + +// BinOpRM_RF - Instructions like "add reg, reg, [mem]". +class BinOpRM_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode> + : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), + [(set typeinfo.RegClass:$dst, EFLAGS, + (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2)))]>; + +// BinOpRM_RFF - Instructions like "adc reg, reg, [mem]". 
+class BinOpRM_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode> + : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), + [(set typeinfo.RegClass:$dst, EFLAGS, + (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2), + EFLAGS))]>; + +// BinOpRI - Instructions like "add reg, reg, imm". +class BinOpRI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + Format f, dag outlist, list<dag> pattern> + : ITy<opcode, f, typeinfo, outlist, + (ins typeinfo.RegClass:$src1, typeinfo.ImmOperand:$src2), + mnemonic, "{$src2, $src1|$src1, $src2}", pattern> { + let ImmT = typeinfo.ImmEncoding; +} + +// BinOpRI_R - Instructions like "add reg, reg, imm". +class BinOpRI_R<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode, Format f> + : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), + [(set typeinfo.RegClass:$dst, + (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2))]>; + +// BinOpRI_F - Instructions like "cmp reg, imm". +class BinOpRI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDPatternOperator opnode, Format f> + : BinOpRI<opcode, mnemonic, typeinfo, f, (outs), + [(set EFLAGS, + (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2))]>; + +// BinOpRI_RF - Instructions like "add reg, reg, imm". +class BinOpRI_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode, Format f> + : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), + [(set typeinfo.RegClass:$dst, EFLAGS, + (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2))]>; + +// BinOpRI_RFF - Instructions like "adc reg, reg, imm". +class BinOpRI_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode, Format f> + : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), + [(set typeinfo.RegClass:$dst, EFLAGS, + (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2, + EFLAGS))]>; + +// BinOpRI8 - Instructions like "add reg, reg, imm8". +class BinOpRI8<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + Format f, dag outlist, list<dag> pattern> + : ITy<opcode, f, typeinfo, outlist, + (ins typeinfo.RegClass:$src1, typeinfo.Imm8Operand:$src2), + mnemonic, "{$src2, $src1|$src1, $src2}", pattern> { + let ImmT = Imm8; // Always 8-bit immediate. +} + +// BinOpRI8_R - Instructions like "add reg, reg, imm8". +class BinOpRI8_R<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode, Format f> + : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), + [(set typeinfo.RegClass:$dst, + (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2))]>; + +// BinOpRI8_F - Instructions like "cmp reg, imm8". +class BinOpRI8_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode, Format f> + : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs), + [(set EFLAGS, + (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2))]>; + +// BinOpRI8_RF - Instructions like "add reg, reg, imm8". +class BinOpRI8_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode, Format f> + : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), + [(set typeinfo.RegClass:$dst, EFLAGS, + (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2))]>; + +// BinOpRI8_RFF - Instructions like "adc reg, reg, imm8". 
+class BinOpRI8_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode, Format f> + : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), + [(set typeinfo.RegClass:$dst, EFLAGS, + (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2, + EFLAGS))]>; + +// BinOpMR - Instructions like "add [mem], reg". +class BinOpMR<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + list<dag> pattern> + : ITy<opcode, MRMDestMem, typeinfo, + (outs), (ins typeinfo.MemOperand:$dst, typeinfo.RegClass:$src), + mnemonic, "{$src, $dst|$dst, $src}", pattern>; + +// BinOpMR_RMW - Instructions like "add [mem], reg". +class BinOpMR_RMW<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode> + : BinOpMR<opcode, mnemonic, typeinfo, + [(store (opnode (load addr:$dst), typeinfo.RegClass:$src), addr:$dst), + (implicit EFLAGS)]>; + +// BinOpMR_RMW_FF - Instructions like "adc [mem], reg". +class BinOpMR_RMW_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode> + : BinOpMR<opcode, mnemonic, typeinfo, + [(store (opnode (load addr:$dst), typeinfo.RegClass:$src, EFLAGS), + addr:$dst), + (implicit EFLAGS)]>; + +// BinOpMR_F - Instructions like "cmp [mem], reg". +class BinOpMR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode> + : BinOpMR<opcode, mnemonic, typeinfo, + [(set EFLAGS, (opnode (load addr:$dst), typeinfo.RegClass:$src))]>; + +// BinOpMI - Instructions like "add [mem], imm". +class BinOpMI<string mnemonic, X86TypeInfo typeinfo, + Format f, list<dag> pattern, bits<8> opcode = 0x80> + : ITy<opcode, f, typeinfo, + (outs), (ins typeinfo.MemOperand:$dst, typeinfo.ImmOperand:$src), + mnemonic, "{$src, $dst|$dst, $src}", pattern> { + let ImmT = typeinfo.ImmEncoding; +} + +// BinOpMI_RMW - Instructions like "add [mem], imm". +class BinOpMI_RMW<string mnemonic, X86TypeInfo typeinfo, + SDNode opnode, Format f> + : BinOpMI<mnemonic, typeinfo, f, + [(store (opnode (typeinfo.VT (load addr:$dst)), + typeinfo.ImmOperator:$src), addr:$dst), + (implicit EFLAGS)]>; + +// BinOpMI_RMW_FF - Instructions like "adc [mem], imm". +class BinOpMI_RMW_FF<string mnemonic, X86TypeInfo typeinfo, + SDNode opnode, Format f> + : BinOpMI<mnemonic, typeinfo, f, + [(store (opnode (typeinfo.VT (load addr:$dst)), + typeinfo.ImmOperator:$src, EFLAGS), addr:$dst), + (implicit EFLAGS)]>; + +// BinOpMI_F - Instructions like "cmp [mem], imm". +class BinOpMI_F<string mnemonic, X86TypeInfo typeinfo, + SDPatternOperator opnode, Format f, bits<8> opcode = 0x80> + : BinOpMI<mnemonic, typeinfo, f, + [(set EFLAGS, (opnode (typeinfo.VT (load addr:$dst)), + typeinfo.ImmOperator:$src))], + opcode>; + +// BinOpMI8 - Instructions like "add [mem], imm8". +class BinOpMI8<string mnemonic, X86TypeInfo typeinfo, + Format f, list<dag> pattern> + : ITy<0x82, f, typeinfo, + (outs), (ins typeinfo.MemOperand:$dst, typeinfo.Imm8Operand:$src), + mnemonic, "{$src, $dst|$dst, $src}", pattern> { + let ImmT = Imm8; // Always 8-bit immediate. +} + +// BinOpMI8_RMW - Instructions like "add [mem], imm8". +class BinOpMI8_RMW<string mnemonic, X86TypeInfo typeinfo, + SDNode opnode, Format f> + : BinOpMI8<mnemonic, typeinfo, f, + [(store (opnode (load addr:$dst), + typeinfo.Imm8Operator:$src), addr:$dst), + (implicit EFLAGS)]>; + +// BinOpMI8_RMW_FF - Instructions like "adc [mem], imm8". 
+class BinOpMI8_RMW_FF<string mnemonic, X86TypeInfo typeinfo, + SDNode opnode, Format f> + : BinOpMI8<mnemonic, typeinfo, f, + [(store (opnode (load addr:$dst), + typeinfo.Imm8Operator:$src, EFLAGS), addr:$dst), + (implicit EFLAGS)]>; + +// BinOpMI8_F - Instructions like "cmp [mem], imm8". +class BinOpMI8_F<string mnemonic, X86TypeInfo typeinfo, + SDNode opnode, Format f> + : BinOpMI8<mnemonic, typeinfo, f, + [(set EFLAGS, (opnode (load addr:$dst), + typeinfo.Imm8Operator:$src))]>; + +// BinOpAI - Instructions like "add %eax, %eax, imm". +class BinOpAI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + Register areg, string operands> + : ITy<opcode, RawFrm, typeinfo, + (outs), (ins typeinfo.ImmOperand:$src), + mnemonic, operands, []> { + let ImmT = typeinfo.ImmEncoding; + let Uses = [areg]; + let Defs = [areg]; +} + +/// ArithBinOp_RF - This is an arithmetic binary operator where the pattern is +/// defined with "(set GPR:$dst, EFLAGS, (...". +/// +/// It would be nice to get rid of the second and third argument here, but +/// tblgen can't handle dependent type references aggressively enough: PR8330 +multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, + string mnemonic, Format RegMRM, Format MemMRM, + SDNode opnodeflag, SDNode opnode, + bit CommutableRR, bit ConvertibleToThreeAddress> { + let Defs = [EFLAGS] in { + let Constraints = "$src1 = $dst" in { + let isCommutable = CommutableRR, + isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + def #NAME#8rr : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , opnodeflag>; + def #NAME#16rr : BinOpRR_RF<BaseOpc, mnemonic, Xi16, opnodeflag>; + def #NAME#32rr : BinOpRR_RF<BaseOpc, mnemonic, Xi32, opnodeflag>; + def #NAME#64rr : BinOpRR_RF<BaseOpc, mnemonic, Xi64, opnodeflag>; + } // isCommutable + + def #NAME#8rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi8>; + def #NAME#16rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi16>; + def #NAME#32rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi32>; + def #NAME#64rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi64>; + + def #NAME#8rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , opnodeflag>; + def #NAME#16rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, opnodeflag>; + def #NAME#32rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, opnodeflag>; + def #NAME#64rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, opnodeflag>; + + let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + // NOTE: These are order specific, we want the ri8 forms to be listed + // first so that they are slightly preferred to the ri forms. + def #NAME#16ri8 : BinOpRI8_RF<0x82, mnemonic, Xi16, opnodeflag, RegMRM>; + def #NAME#32ri8 : BinOpRI8_RF<0x82, mnemonic, Xi32, opnodeflag, RegMRM>; + def #NAME#64ri8 : BinOpRI8_RF<0x82, mnemonic, Xi64, opnodeflag, RegMRM>; + + def #NAME#8ri : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM>; + def #NAME#16ri : BinOpRI_RF<0x80, mnemonic, Xi16, opnodeflag, RegMRM>; + def #NAME#32ri : BinOpRI_RF<0x80, mnemonic, Xi32, opnodeflag, RegMRM>; + def #NAME#64ri32: BinOpRI_RF<0x80, mnemonic, Xi64, opnodeflag, RegMRM>; + } + } // Constraints = "$src1 = $dst" + + def #NAME#8mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi8 , opnode>; + def #NAME#16mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi16, opnode>; + def #NAME#32mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi32, opnode>; + def #NAME#64mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi64, opnode>; + + // NOTE: These are order specific, we want the mi8 forms to be listed + // first so that they are slightly preferred to the mi forms. 
+ def #NAME#16mi8 : BinOpMI8_RMW<mnemonic, Xi16, opnode, MemMRM>; + def #NAME#32mi8 : BinOpMI8_RMW<mnemonic, Xi32, opnode, MemMRM>; + def #NAME#64mi8 : BinOpMI8_RMW<mnemonic, Xi64, opnode, MemMRM>; + + def #NAME#8mi : BinOpMI_RMW<mnemonic, Xi8 , opnode, MemMRM>; + def #NAME#16mi : BinOpMI_RMW<mnemonic, Xi16, opnode, MemMRM>; + def #NAME#32mi : BinOpMI_RMW<mnemonic, Xi32, opnode, MemMRM>; + def #NAME#64mi32 : BinOpMI_RMW<mnemonic, Xi64, opnode, MemMRM>; + + def #NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL, + "{$src, %al|AL, $src}">; + def #NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX, + "{$src, %ax|AX, $src}">; + def #NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX, + "{$src, %eax|EAX, $src}">; + def #NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX, + "{$src, %rax|RAX, $src}">; + } +} + +/// ArithBinOp_RFF - This is an arithmetic binary operator where the pattern is +/// defined with "(set GPR:$dst, EFLAGS, (node LHS, RHS, EFLAGS))" like ADC and +/// SBB. +/// +/// It would be nice to get rid of the second and third argument here, but +/// tblgen can't handle dependent type references aggressively enough: PR8330 +multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, + string mnemonic, Format RegMRM, Format MemMRM, + SDNode opnode, bit CommutableRR, + bit ConvertibleToThreeAddress> { + let Defs = [EFLAGS] in { + let Constraints = "$src1 = $dst" in { + let isCommutable = CommutableRR, + isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + def #NAME#8rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi8 , opnode>; + def #NAME#16rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi16, opnode>; + def #NAME#32rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi32, opnode>; + def #NAME#64rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi64, opnode>; + } // isCommutable + + def #NAME#8rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi8>; + def #NAME#16rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi16>; + def #NAME#32rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi32>; + def #NAME#64rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi64>; + + def #NAME#8rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi8 , opnode>; + def #NAME#16rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi16, opnode>; + def #NAME#32rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi32, opnode>; + def #NAME#64rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi64, opnode>; + + let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + // NOTE: These are order specific, we want the ri8 forms to be listed + // first so that they are slightly preferred to the ri forms. + def #NAME#16ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi16, opnode, RegMRM>; + def #NAME#32ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi32, opnode, RegMRM>; + def #NAME#64ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi64, opnode, RegMRM>; + + def #NAME#8ri : BinOpRI_RFF<0x80, mnemonic, Xi8 , opnode, RegMRM>; + def #NAME#16ri : BinOpRI_RFF<0x80, mnemonic, Xi16, opnode, RegMRM>; + def #NAME#32ri : BinOpRI_RFF<0x80, mnemonic, Xi32, opnode, RegMRM>; + def #NAME#64ri32: BinOpRI_RFF<0x80, mnemonic, Xi64, opnode, RegMRM>; + } + } // Constraints = "$src1 = $dst" + + def #NAME#8mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi8 , opnode>; + def #NAME#16mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi16, opnode>; + def #NAME#32mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi32, opnode>; + def #NAME#64mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi64, opnode>; + + // NOTE: These are order specific, we want the mi8 forms to be listed + // first so that they are slightly preferred to the mi forms. 
+ def #NAME#16mi8 : BinOpMI8_RMW_FF<mnemonic, Xi16, opnode, MemMRM>; + def #NAME#32mi8 : BinOpMI8_RMW_FF<mnemonic, Xi32, opnode, MemMRM>; + def #NAME#64mi8 : BinOpMI8_RMW_FF<mnemonic, Xi64, opnode, MemMRM>; + + def #NAME#8mi : BinOpMI_RMW_FF<mnemonic, Xi8 , opnode, MemMRM>; + def #NAME#16mi : BinOpMI_RMW_FF<mnemonic, Xi16, opnode, MemMRM>; + def #NAME#32mi : BinOpMI_RMW_FF<mnemonic, Xi32, opnode, MemMRM>; + def #NAME#64mi32 : BinOpMI_RMW_FF<mnemonic, Xi64, opnode, MemMRM>; + + def #NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL, + "{$src, %al|AL, $src}">; + def #NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX, + "{$src, %ax|AX, $src}">; + def #NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX, + "{$src, %eax|EAX, $src}">; + def #NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX, + "{$src, %rax|RAX, $src}">; + } +} + +/// ArithBinOp_F - This is an arithmetic binary operator where the pattern is +/// defined with "(set EFLAGS, (...". It would be really nice to find a way +/// to factor this with the other ArithBinOp_*. +/// +multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, + string mnemonic, Format RegMRM, Format MemMRM, + SDNode opnode, + bit CommutableRR, bit ConvertibleToThreeAddress> { + let Defs = [EFLAGS] in { + let isCommutable = CommutableRR, + isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + def #NAME#8rr : BinOpRR_F<BaseOpc, mnemonic, Xi8 , opnode>; + def #NAME#16rr : BinOpRR_F<BaseOpc, mnemonic, Xi16, opnode>; + def #NAME#32rr : BinOpRR_F<BaseOpc, mnemonic, Xi32, opnode>; + def #NAME#64rr : BinOpRR_F<BaseOpc, mnemonic, Xi64, opnode>; + } // isCommutable + + def #NAME#8rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi8>; + def #NAME#16rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi16>; + def #NAME#32rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi32>; + def #NAME#64rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi64>; + + def #NAME#8rm : BinOpRM_F<BaseOpc2, mnemonic, Xi8 , opnode>; + def #NAME#16rm : BinOpRM_F<BaseOpc2, mnemonic, Xi16, opnode>; + def #NAME#32rm : BinOpRM_F<BaseOpc2, mnemonic, Xi32, opnode>; + def #NAME#64rm : BinOpRM_F<BaseOpc2, mnemonic, Xi64, opnode>; + + let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + // NOTE: These are order specific, we want the ri8 forms to be listed + // first so that they are slightly preferred to the ri forms. + def #NAME#16ri8 : BinOpRI8_F<0x82, mnemonic, Xi16, opnode, RegMRM>; + def #NAME#32ri8 : BinOpRI8_F<0x82, mnemonic, Xi32, opnode, RegMRM>; + def #NAME#64ri8 : BinOpRI8_F<0x82, mnemonic, Xi64, opnode, RegMRM>; + + def #NAME#8ri : BinOpRI_F<0x80, mnemonic, Xi8 , opnode, RegMRM>; + def #NAME#16ri : BinOpRI_F<0x80, mnemonic, Xi16, opnode, RegMRM>; + def #NAME#32ri : BinOpRI_F<0x80, mnemonic, Xi32, opnode, RegMRM>; + def #NAME#64ri32: BinOpRI_F<0x80, mnemonic, Xi64, opnode, RegMRM>; + } + + def #NAME#8mr : BinOpMR_F<BaseOpc, mnemonic, Xi8 , opnode>; + def #NAME#16mr : BinOpMR_F<BaseOpc, mnemonic, Xi16, opnode>; + def #NAME#32mr : BinOpMR_F<BaseOpc, mnemonic, Xi32, opnode>; + def #NAME#64mr : BinOpMR_F<BaseOpc, mnemonic, Xi64, opnode>; + + // NOTE: These are order specific, we want the mi8 forms to be listed + // first so that they are slightly preferred to the mi forms. 
+  def #NAME#16mi8 : BinOpMI8_F<mnemonic, Xi16, opnode, MemMRM>;
+  def #NAME#32mi8 : BinOpMI8_F<mnemonic, Xi32, opnode, MemMRM>;
+  def #NAME#64mi8 : BinOpMI8_F<mnemonic, Xi64, opnode, MemMRM>;
+
+  def #NAME#8mi    : BinOpMI_F<mnemonic, Xi8 , opnode, MemMRM>;
+  def #NAME#16mi   : BinOpMI_F<mnemonic, Xi16, opnode, MemMRM>;
+  def #NAME#32mi   : BinOpMI_F<mnemonic, Xi32, opnode, MemMRM>;
+  def #NAME#64mi32 : BinOpMI_F<mnemonic, Xi64, opnode, MemMRM>;
+
+  def #NAME#8i8   : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL,
+                            "{$src, %al|AL, $src}">;
+  def #NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX,
+                            "{$src, %ax|AX, $src}">;
+  def #NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX,
+                            "{$src, %eax|EAX, $src}">;
+  def #NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX,
+                            "{$src, %rax|RAX, $src}">;
+  }
+}
+
+
+defm AND : ArithBinOp_RF<0x20, 0x22, 0x24, "and", MRM4r, MRM4m,
+                         X86and_flag, and, 1, 0>;
+defm OR  : ArithBinOp_RF<0x08, 0x0A, 0x0C, "or", MRM1r, MRM1m,
+                         X86or_flag, or, 1, 0>;
+defm XOR : ArithBinOp_RF<0x30, 0x32, 0x34, "xor", MRM6r, MRM6m,
+                         X86xor_flag, xor, 1, 0>;
+defm ADD : ArithBinOp_RF<0x00, 0x02, 0x04, "add", MRM0r, MRM0m,
+                         X86add_flag, add, 1, 1>;
+defm SUB : ArithBinOp_RF<0x28, 0x2A, 0x2C, "sub", MRM5r, MRM5m,
+                         X86sub_flag, sub, 0, 0>;
+
+// Arithmetic.
+let Uses = [EFLAGS] in {
+  defm ADC : ArithBinOp_RFF<0x10, 0x12, 0x14, "adc", MRM2r, MRM2m, X86adc_flag,
+                            1, 0>;
+  defm SBB : ArithBinOp_RFF<0x18, 0x1A, 0x1C, "sbb", MRM3r, MRM3m, X86sbb_flag,
+                            0, 0>;
+}
+
+defm CMP : ArithBinOp_F<0x38, 0x3A, 0x3C, "cmp", MRM7r, MRM7m, X86cmp, 0, 0>;
+
+
+//===----------------------------------------------------------------------===//
+// Semantically, test instructions are similar to AND, except they don't
+// generate a result. From an encoding perspective, they are very different:
+// they don't have all the usual imm8 and REV forms, and are encoded into a
+// different space.
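As a minimal illustration of that semantics: "test x, y" computes x & y only to set EFLAGS and discards the value, which is why the PatFrag below matches it as a compare of the AND against zero.

    // ZF after "test x, y" is exactly ((x & y) == 0); no destination register
    // is written, only EFLAGS.
    bool zfAfterTest(unsigned x, unsigned y) { return (x & y) == 0; }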
+def X86testpat : PatFrag<(ops node:$lhs, node:$rhs), + (X86cmp (and_su node:$lhs, node:$rhs), 0)>; + +let Defs = [EFLAGS] in { + let isCommutable = 1 in { + def TEST8rr : BinOpRR_F<0x84, "test", Xi8 , X86testpat, MRMSrcReg>; + def TEST16rr : BinOpRR_F<0x84, "test", Xi16, X86testpat, MRMSrcReg>; + def TEST32rr : BinOpRR_F<0x84, "test", Xi32, X86testpat, MRMSrcReg>; + def TEST64rr : BinOpRR_F<0x84, "test", Xi64, X86testpat, MRMSrcReg>; + } // isCommutable + + def TEST8rm : BinOpRM_F<0x84, "test", Xi8 , X86testpat>; + def TEST16rm : BinOpRM_F<0x84, "test", Xi16, X86testpat>; + def TEST32rm : BinOpRM_F<0x84, "test", Xi32, X86testpat>; + def TEST64rm : BinOpRM_F<0x84, "test", Xi64, X86testpat>; + + def TEST8ri : BinOpRI_F<0xF6, "test", Xi8 , X86testpat, MRM0r>; + def TEST16ri : BinOpRI_F<0xF6, "test", Xi16, X86testpat, MRM0r>; + def TEST32ri : BinOpRI_F<0xF6, "test", Xi32, X86testpat, MRM0r>; + def TEST64ri32 : BinOpRI_F<0xF6, "test", Xi64, X86testpat, MRM0r>; + + def TEST8mi : BinOpMI_F<"test", Xi8 , X86testpat, MRM0m, 0xF6>; + def TEST16mi : BinOpMI_F<"test", Xi16, X86testpat, MRM0m, 0xF6>; + def TEST32mi : BinOpMI_F<"test", Xi32, X86testpat, MRM0m, 0xF6>; + def TEST64mi32 : BinOpMI_F<"test", Xi64, X86testpat, MRM0m, 0xF6>; + + def TEST8i8 : BinOpAI<0xA8, "test", Xi8 , AL, + "{$src, %al|AL, $src}">; + def TEST16i16 : BinOpAI<0xA8, "test", Xi16, AX, + "{$src, %ax|AX, $src}">; + def TEST32i32 : BinOpAI<0xA8, "test", Xi32, EAX, + "{$src, %eax|EAX, $src}">; + def TEST64i32 : BinOpAI<0xA8, "test", Xi64, RAX, + "{$src, %rax|RAX, $src}">; + + // When testing the result of EXTRACT_SUBREG sub_8bit_hi, make sure the + // register class is constrained to GR8_NOREX. + let isPseudo = 1 in + def TEST8ri_NOREX : I<0, Pseudo, (outs), (ins GR8_NOREX:$src, i8imm:$mask), + "", []>; +} + +//===----------------------------------------------------------------------===// +// ANDN Instruction +// +multiclass bmi_andn<string mnemonic, RegisterClass RC, X86MemOperand x86memop, + PatFrag ld_frag> { + def rr : I<0xF2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, EFLAGS, (X86andn_flag RC:$src1, RC:$src2))]>; + def rm : I<0xF2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, EFLAGS, + (X86andn_flag RC:$src1, (ld_frag addr:$src2)))]>; +} + +let Predicates = [HasBMI], Defs = [EFLAGS] in { + defm ANDN32 : bmi_andn<"andn{l}", GR32, i32mem, loadi32>, T8, VEX_4V; + defm ANDN64 : bmi_andn<"andn{q}", GR64, i64mem, loadi64>, T8, VEX_4V, VEX_W; +} diff --git a/contrib/llvm/lib/Target/X86/X86InstrBuilder.h b/contrib/llvm/lib/Target/X86/X86InstrBuilder.h new file mode 100644 index 0000000..0245e5c --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrBuilder.h @@ -0,0 +1,184 @@ +//===-- X86InstrBuilder.h - Functions to aid building x86 insts -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file exposes functions that may be used with BuildMI from the +// MachineInstrBuilder.h file to handle X86'isms in a clean way. +// +// The BuildMem function may be used with the BuildMI function to add entire +// memory references in a single, typed, function call. 
X86 memory references +// can be very complex expressions (described in the README), so wrapping them +// up behind an easier to use interface makes sense. Descriptions of the +// functions are included below. +// +// For reference, the order of operands for memory references is: +// (Operand), Base, Scale, Index, Displacement. +// +//===----------------------------------------------------------------------===// + +#ifndef X86INSTRBUILDER_H +#define X86INSTRBUILDER_H + +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/PseudoSourceValue.h" + +namespace llvm { + +/// X86AddressMode - This struct holds a generalized full x86 address mode. +/// The base register can be a frame index, which will eventually be replaced +/// with BP or SP and Disp being offsetted accordingly. The displacement may +/// also include the offset of a global value. +struct X86AddressMode { + enum { + RegBase, + FrameIndexBase + } BaseType; + + union { + unsigned Reg; + int FrameIndex; + } Base; + + unsigned Scale; + unsigned IndexReg; + int Disp; + const GlobalValue *GV; + unsigned GVOpFlags; + + X86AddressMode() + : BaseType(RegBase), Scale(1), IndexReg(0), Disp(0), GV(0), GVOpFlags(0) { + Base.Reg = 0; + } + + + void getFullAddress(SmallVectorImpl<MachineOperand> &MO) { + assert(Scale == 1 || Scale == 2 || Scale == 4 || Scale == 8); + + if (BaseType == X86AddressMode::RegBase) + MO.push_back(MachineOperand::CreateReg(Base.Reg, false, false, + false, false, false, 0, false)); + else { + assert(BaseType == X86AddressMode::FrameIndexBase); + MO.push_back(MachineOperand::CreateFI(Base.FrameIndex)); + } + + MO.push_back(MachineOperand::CreateImm(Scale)); + MO.push_back(MachineOperand::CreateReg(IndexReg, false, false, + false, false, false, 0, false)); + + if (GV) + MO.push_back(MachineOperand::CreateGA(GV, Disp, GVOpFlags)); + else + MO.push_back(MachineOperand::CreateImm(Disp)); + + MO.push_back(MachineOperand::CreateReg(0, false, false, + false, false, false, 0, false)); + } +}; + +/// addDirectMem - This function is used to add a direct memory reference to the +/// current instruction -- that is, a dereference of an address in a register, +/// with no scale, index or displacement. An example is: DWORD PTR [EAX]. +/// +static inline const MachineInstrBuilder & +addDirectMem(const MachineInstrBuilder &MIB, unsigned Reg) { + // Because memory references are always represented with five + // values, this adds: Reg, 1, NoReg, 0, NoReg to the instruction. + return MIB.addReg(Reg).addImm(1).addReg(0).addImm(0).addReg(0); +} + + +static inline const MachineInstrBuilder & +addOffset(const MachineInstrBuilder &MIB, int Offset) { + return MIB.addImm(1).addReg(0).addImm(Offset).addReg(0); +} + +/// addRegOffset - This function is used to add a memory reference of the form +/// [Reg + Offset], i.e., one with no scale or index, but with a +/// displacement. An example is: DWORD PTR [EAX + 4]. +/// +static inline const MachineInstrBuilder & +addRegOffset(const MachineInstrBuilder &MIB, + unsigned Reg, bool isKill, int Offset) { + return addOffset(MIB.addReg(Reg, getKillRegState(isKill)), Offset); +} + +/// addRegReg - This function is used to add a memory reference of the form: +/// [Reg + Reg]. 
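A usage sketch for the helpers in this header (some are declared above, the rest just below). This is illustrative only: it assumes the usual BuildMI overloads from MachineInstrBuilder.h, uses X86::MOV32rm purely as an example opcode, and the registers and frame index are placeholders supplied by the caller.

    #include "X86InstrBuilder.h"
    #include "X86InstrInfo.h"
    #include "llvm/Target/TargetInstrInfo.h"

    using namespace llvm;

    // Build a few 32-bit loads through the helpers in this header.
    static void buildLoads(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                           DebugLoc DL, const TargetInstrInfo &TII,
                           unsigned DstReg, unsigned BaseReg, unsigned IdxReg,
                           int FrameIdx) {
      // mov DstReg, dword ptr [BaseReg]
      addDirectMem(BuildMI(MBB, I, DL, TII.get(X86::MOV32rm), DstReg), BaseReg);

      // mov DstReg, dword ptr [BaseReg + 8]
      addRegOffset(BuildMI(MBB, I, DL, TII.get(X86::MOV32rm), DstReg),
                   BaseReg, /*isKill=*/false, 8);

      // mov DstReg, dword ptr [BaseReg + IdxReg]
      addRegReg(BuildMI(MBB, I, DL, TII.get(X86::MOV32rm), DstReg),
                BaseReg, false, IdxReg, false);

      // mov DstReg, dword ptr [stack slot FrameIdx]; also attaches a MachineMemOperand
      addFrameReference(BuildMI(MBB, I, DL, TII.get(X86::MOV32rm), DstReg),
                        FrameIdx);
    }

Each helper just appends the five standard memory operands (base, scale, index, displacement, segment) in the order described at the top of this file.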
+static inline const MachineInstrBuilder &addRegReg(const MachineInstrBuilder &MIB, + unsigned Reg1, bool isKill1, + unsigned Reg2, bool isKill2) { + return MIB.addReg(Reg1, getKillRegState(isKill1)).addImm(1) + .addReg(Reg2, getKillRegState(isKill2)).addImm(0).addReg(0); +} + +static inline const MachineInstrBuilder & +addFullAddress(const MachineInstrBuilder &MIB, + const X86AddressMode &AM) { + assert(AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8); + + if (AM.BaseType == X86AddressMode::RegBase) + MIB.addReg(AM.Base.Reg); + else { + assert(AM.BaseType == X86AddressMode::FrameIndexBase); + MIB.addFrameIndex(AM.Base.FrameIndex); + } + + MIB.addImm(AM.Scale).addReg(AM.IndexReg); + if (AM.GV) + MIB.addGlobalAddress(AM.GV, AM.Disp, AM.GVOpFlags); + else + MIB.addImm(AM.Disp); + + return MIB.addReg(0); +} + +/// addFrameReference - This function is used to add a reference to the base of +/// an abstract object on the stack frame of the current function. This +/// reference has base register as the FrameIndex offset until it is resolved. +/// This allows a constant offset to be specified as well... +/// +static inline const MachineInstrBuilder & +addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0) { + MachineInstr *MI = MIB; + MachineFunction &MF = *MI->getParent()->getParent(); + MachineFrameInfo &MFI = *MF.getFrameInfo(); + const MCInstrDesc &MCID = MI->getDesc(); + unsigned Flags = 0; + if (MCID.mayLoad()) + Flags |= MachineMemOperand::MOLoad; + if (MCID.mayStore()) + Flags |= MachineMemOperand::MOStore; + MachineMemOperand *MMO = + MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI, Offset), + Flags, MFI.getObjectSize(FI), + MFI.getObjectAlignment(FI)); + return addOffset(MIB.addFrameIndex(FI), Offset) + .addMemOperand(MMO); +} + +/// addConstantPoolReference - This function is used to add a reference to the +/// base of a constant value spilled to the per-function constant pool. The +/// reference uses the abstract ConstantPoolIndex which is retained until +/// either machine code emission or assembly output. In PIC mode on x86-32, +/// the GlobalBaseReg parameter can be used to make this a +/// GlobalBaseReg-relative reference. +/// +static inline const MachineInstrBuilder & +addConstantPoolReference(const MachineInstrBuilder &MIB, unsigned CPI, + unsigned GlobalBaseReg, unsigned char OpFlags) { + //FIXME: factor this + return MIB.addReg(GlobalBaseReg).addImm(1).addReg(0) + .addConstantPoolIndex(CPI, 0, OpFlags).addReg(0); +} + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td b/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td new file mode 100644 index 0000000..3a43b22 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td @@ -0,0 +1,104 @@ +//===- X86InstrCMovSetCC.td - Conditional Move and SetCC ---*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the X86 conditional move and set on condition +// instructions. +// +//===----------------------------------------------------------------------===// + + +// SetCC instructions. 
+multiclass CMOV<bits<8> opc, string Mnemonic, PatLeaf CondNode> { + let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst", + isCommutable = 1 in { + def #NAME#16rr + : I<opc, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), + !strconcat(Mnemonic, "{w}\t{$src2, $dst|$dst, $src2}"), + [(set GR16:$dst, + (X86cmov GR16:$src1, GR16:$src2, CondNode, EFLAGS))]>,TB,OpSize; + def #NAME#32rr + : I<opc, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), + !strconcat(Mnemonic, "{l}\t{$src2, $dst|$dst, $src2}"), + [(set GR32:$dst, + (X86cmov GR32:$src1, GR32:$src2, CondNode, EFLAGS))]>, TB; + def #NAME#64rr + :RI<opc, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), + !strconcat(Mnemonic, "{q}\t{$src2, $dst|$dst, $src2}"), + [(set GR64:$dst, + (X86cmov GR64:$src1, GR64:$src2, CondNode, EFLAGS))]>, TB; + } + + let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst" in { + def #NAME#16rm + : I<opc, MRMSrcMem, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2), + !strconcat(Mnemonic, "{w}\t{$src2, $dst|$dst, $src2}"), + [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2), + CondNode, EFLAGS))]>, TB, OpSize; + def #NAME#32rm + : I<opc, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2), + !strconcat(Mnemonic, "{l}\t{$src2, $dst|$dst, $src2}"), + [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2), + CondNode, EFLAGS))]>, TB; + def #NAME#64rm + :RI<opc, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2), + !strconcat(Mnemonic, "{q}\t{$src2, $dst|$dst, $src2}"), + [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2), + CondNode, EFLAGS))]>, TB; + } // Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst" +} // end multiclass + + +// Conditional Moves. +defm CMOVO : CMOV<0x40, "cmovo" , X86_COND_O>; +defm CMOVNO : CMOV<0x41, "cmovno", X86_COND_NO>; +defm CMOVB : CMOV<0x42, "cmovb" , X86_COND_B>; +defm CMOVAE : CMOV<0x43, "cmovae", X86_COND_AE>; +defm CMOVE : CMOV<0x44, "cmove" , X86_COND_E>; +defm CMOVNE : CMOV<0x45, "cmovne", X86_COND_NE>; +defm CMOVBE : CMOV<0x46, "cmovbe", X86_COND_BE>; +defm CMOVA : CMOV<0x47, "cmova" , X86_COND_A>; +defm CMOVS : CMOV<0x48, "cmovs" , X86_COND_S>; +defm CMOVNS : CMOV<0x49, "cmovns", X86_COND_NS>; +defm CMOVP : CMOV<0x4A, "cmovp" , X86_COND_P>; +defm CMOVNP : CMOV<0x4B, "cmovnp", X86_COND_NP>; +defm CMOVL : CMOV<0x4C, "cmovl" , X86_COND_L>; +defm CMOVGE : CMOV<0x4D, "cmovge", X86_COND_GE>; +defm CMOVLE : CMOV<0x4E, "cmovle", X86_COND_LE>; +defm CMOVG : CMOV<0x4F, "cmovg" , X86_COND_G>; + + +// SetCC instructions. 
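For orientation, this is the kind of code the SETcc definitions below end up covering. The sketch is hedged: the exact instruction sequence depends on the instruction selector, and the shown assembly is only one possible lowering.

    // A boolean result is materialized into an 8-bit register with SETcc and
    // then widened; one possible lowering of this function is:
    //   testl  %edi, %edi     ; set SF from the sign bit of x
    //   sets   %al            ; SETSr -- AL = (SF == 1)
    //   movzbl %al, %eax      ; zero-extend the i8 result
    bool isNegative(int x) { return x < 0; }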
+multiclass SETCC<bits<8> opc, string Mnemonic, PatLeaf OpNode> { + let Uses = [EFLAGS] in { + def r : I<opc, MRM0r, (outs GR8:$dst), (ins), + !strconcat(Mnemonic, "\t$dst"), + [(set GR8:$dst, (X86setcc OpNode, EFLAGS))]>, TB; + def m : I<opc, MRM0m, (outs), (ins i8mem:$dst), + !strconcat(Mnemonic, "\t$dst"), + [(store (X86setcc OpNode, EFLAGS), addr:$dst)]>, TB; + } // Uses = [EFLAGS] +} + +defm SETO : SETCC<0x90, "seto", X86_COND_O>; // is overflow bit set +defm SETNO : SETCC<0x91, "setno", X86_COND_NO>; // is overflow bit not set +defm SETB : SETCC<0x92, "setb", X86_COND_B>; // unsigned less than +defm SETAE : SETCC<0x93, "setae", X86_COND_AE>; // unsigned greater or equal +defm SETE : SETCC<0x94, "sete", X86_COND_E>; // equal to +defm SETNE : SETCC<0x95, "setne", X86_COND_NE>; // not equal to +defm SETBE : SETCC<0x96, "setbe", X86_COND_BE>; // unsigned less than or equal +defm SETA : SETCC<0x97, "seta", X86_COND_A>; // unsigned greater than +defm SETS : SETCC<0x98, "sets", X86_COND_S>; // is signed bit set +defm SETNS : SETCC<0x99, "setns", X86_COND_NS>; // is not signed +defm SETP : SETCC<0x9A, "setp", X86_COND_P>; // is parity bit set +defm SETNP : SETCC<0x9B, "setnp", X86_COND_NP>; // is parity bit not set +defm SETL : SETCC<0x9C, "setl", X86_COND_L>; // signed less than +defm SETGE : SETCC<0x9D, "setge", X86_COND_GE>; // signed greater or equal +defm SETLE : SETCC<0x9E, "setle", X86_COND_LE>; // signed less than or equal +defm SETG : SETCC<0x9F, "setg", X86_COND_G>; // signed greater than + diff --git a/contrib/llvm/lib/Target/X86/X86InstrCompiler.td b/contrib/llvm/lib/Target/X86/X86InstrCompiler.td new file mode 100644 index 0000000..da28690 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrCompiler.td @@ -0,0 +1,1737 @@ +//===- X86InstrCompiler.td - Compiler Pseudos and Patterns -*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the various pseudo instructions used by the compiler, +// as well as Pat patterns used during instruction selection. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Pattern Matching Support + +def GetLo32XForm : SDNodeXForm<imm, [{ + // Transformation function: get the low 32 bits. + return getI32Imm((unsigned)N->getZExtValue()); +}]>; + +def GetLo8XForm : SDNodeXForm<imm, [{ + // Transformation function: get the low 8 bits. + return getI8Imm((uint8_t)N->getZExtValue()); +}]>; + + +//===----------------------------------------------------------------------===// +// Random Pseudo Instructions. + +// PIC base construction. This expands to code that looks like this: +// call $next_inst +// popl %destreg" +let neverHasSideEffects = 1, isNotDuplicable = 1, Uses = [ESP] in + def MOVPC32r : Ii32<0xE8, Pseudo, (outs GR32:$reg), (ins i32imm:$label), + "", []>; + + +// ADJCALLSTACKDOWN/UP implicitly use/def ESP because they may be expanded into +// a stack adjustment and the codegen must know that they may modify the stack +// pointer before prolog-epilog rewriting occurs. +// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become +// sub / add which can clobber EFLAGS. 
+let Defs = [ESP, EFLAGS], Uses = [ESP] in { +def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt), + "#ADJCALLSTACKDOWN", + [(X86callseq_start timm:$amt)]>, + Requires<[In32BitMode]>; +def ADJCALLSTACKUP32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), + "#ADJCALLSTACKUP", + [(X86callseq_end timm:$amt1, timm:$amt2)]>, + Requires<[In32BitMode]>; +} + +// ADJCALLSTACKDOWN/UP implicitly use/def RSP because they may be expanded into +// a stack adjustment and the codegen must know that they may modify the stack +// pointer before prolog-epilog rewriting occurs. +// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become +// sub / add which can clobber EFLAGS. +let Defs = [RSP, EFLAGS], Uses = [RSP] in { +def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt), + "#ADJCALLSTACKDOWN", + [(X86callseq_start timm:$amt)]>, + Requires<[In64BitMode]>; +def ADJCALLSTACKUP64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), + "#ADJCALLSTACKUP", + [(X86callseq_end timm:$amt1, timm:$amt2)]>, + Requires<[In64BitMode]>; +} + + + +// x86-64 va_start lowering magic. +let usesCustomInserter = 1 in { +def VASTART_SAVE_XMM_REGS : I<0, Pseudo, + (outs), + (ins GR8:$al, + i64imm:$regsavefi, i64imm:$offset, + variable_ops), + "#VASTART_SAVE_XMM_REGS $al, $regsavefi, $offset", + [(X86vastart_save_xmm_regs GR8:$al, + imm:$regsavefi, + imm:$offset)]>; + +// The VAARG_64 pseudo-instruction takes the address of the va_list, +// and places the address of the next argument into a register. +let Defs = [EFLAGS] in +def VAARG_64 : I<0, Pseudo, + (outs GR64:$dst), + (ins i8mem:$ap, i32imm:$size, i8imm:$mode, i32imm:$align), + "#VAARG_64 $dst, $ap, $size, $mode, $align", + [(set GR64:$dst, + (X86vaarg64 addr:$ap, imm:$size, imm:$mode, imm:$align)), + (implicit EFLAGS)]>; + +// Dynamic stack allocation yields a _chkstk or _alloca call for all Windows +// targets. These calls are needed to probe the stack when allocating more than +// 4k bytes in one go. Touching the stack at 4K increments is necessary to +// ensure that the guard pages used by the OS virtual memory manager are +// allocated in correct sequence. +// The main point of having separate instruction are extra unmodelled effects +// (compared to ordinary calls) like stack pointer change. + +let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in + def WIN_ALLOCA : I<0, Pseudo, (outs), (ins), + "# dynamic stack allocation", + [(X86WinAlloca)]>; + +// When using segmented stacks these are lowered into instructions which first +// check if the current stacklet has enough free memory. If it does, memory is +// allocated by bumping the stack pointer. Otherwise memory is allocated from +// the heap. 
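The check described above can be sketched roughly as follows. This is purely conceptual; the names are illustrative placeholders and do not correspond to LLVM or runtime APIs.

    #include <stddef.h>
    #include <stdint.h>

    // Conceptual behaviour of the SEG_ALLOCA pseudos: try to bump the stack
    // pointer within the current stacklet, otherwise ask the runtime for memory.
    uintptr_t seg_alloca(uintptr_t sp, uintptr_t stacklet_limit, size_t size,
                         uintptr_t (*runtime_alloc)(size_t)) {
      if (sp - size >= stacklet_limit)   // enough free room in this stacklet?
        return sp - size;                // yes: allocate by bumping the stack pointer
      return runtime_alloc(size);        // no: fall back to the runtime (heap)
    }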
+ +let Defs = [EAX, ESP, EFLAGS], Uses = [ESP, EAX] in +def SEG_ALLOCA_32 : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$size), + "# variable sized alloca for segmented stacks", + [(set GR32:$dst, + (X86SegAlloca GR32:$size))]>, + Requires<[In32BitMode]>; + +let Defs = [RAX, RSP, EFLAGS], Uses = [RSP, RAX] in +def SEG_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size), + "# variable sized alloca for segmented stacks", + [(set GR64:$dst, + (X86SegAlloca GR64:$size))]>, + Requires<[In64BitMode]>; + +} + + + +//===----------------------------------------------------------------------===// +// EH Pseudo Instructions +// +let isTerminator = 1, isReturn = 1, isBarrier = 1, + hasCtrlDep = 1, isCodeGenOnly = 1 in { +def EH_RETURN : I<0xC3, RawFrm, (outs), (ins GR32:$addr), + "ret\t#eh_return, addr: $addr", + [(X86ehret GR32:$addr)]>; + +} + +let isTerminator = 1, isReturn = 1, isBarrier = 1, + hasCtrlDep = 1, isCodeGenOnly = 1 in { +def EH_RETURN64 : I<0xC3, RawFrm, (outs), (ins GR64:$addr), + "ret\t#eh_return, addr: $addr", + [(X86ehret GR64:$addr)]>; + +} + +//===----------------------------------------------------------------------===// +// Alias Instructions +//===----------------------------------------------------------------------===// + +// Alias instructions that map movr0 to xor. +// FIXME: remove when we can teach regalloc that xor reg, reg is ok. +// FIXME: Set encoding to pseudo. +let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1, + isCodeGenOnly = 1 in { +def MOV8r0 : I<0x30, MRMInitReg, (outs GR8 :$dst), (ins), "", + [(set GR8:$dst, 0)]>; + +// We want to rewrite MOV16r0 in terms of MOV32r0, because it's a smaller +// encoding and avoids a partial-register update sometimes, but doing so +// at isel time interferes with rematerialization in the current register +// allocator. For now, this is rewritten when the instruction is lowered +// to an MCInst. +def MOV16r0 : I<0x31, MRMInitReg, (outs GR16:$dst), (ins), + "", + [(set GR16:$dst, 0)]>, OpSize; + +// FIXME: Set encoding to pseudo. +def MOV32r0 : I<0x31, MRMInitReg, (outs GR32:$dst), (ins), "", + [(set GR32:$dst, 0)]>; +} + +// We want to rewrite MOV64r0 in terms of MOV32r0, because it's sometimes a +// smaller encoding, but doing so at isel time interferes with rematerialization +// in the current register allocator. For now, this is rewritten when the +// instruction is lowered to an MCInst. +// FIXME: AddedComplexity gives this a higher priority than MOV64ri32. Remove +// when we have a better way to specify isel priority. +let Defs = [EFLAGS], isCodeGenOnly=1, + AddedComplexity = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in +def MOV64r0 : I<0x31, MRMInitReg, (outs GR64:$dst), (ins), "", + [(set GR64:$dst, 0)]>; + +// Materialize i64 constant where top 32-bits are zero. This could theoretically +// use MOV32ri with a SUBREG_TO_REG to represent the zero-extension, however +// that would make it more difficult to rematerialize. +let AddedComplexity = 1, isReMaterializable = 1, isAsCheapAsAMove = 1, + isCodeGenOnly = 1 in +def MOV64ri64i32 : Ii32<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64i32imm:$src), + "", [(set GR64:$dst, i64immZExt32:$src)]>; + +// Use sbb to materialize carry bit. +let Uses = [EFLAGS], Defs = [EFLAGS], isCodeGenOnly = 1 in { +// FIXME: These are pseudo ops that should be replaced with Pat<> patterns. +// However, Pat<> can't replicate the destination reg into the inputs of the +// result. 
+// FIXME: Change these to have encoding Pseudo when X86MCCodeEmitter replaces +// X86CodeEmitter. +def SETB_C8r : I<0x18, MRMInitReg, (outs GR8:$dst), (ins), "", + [(set GR8:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>; +def SETB_C16r : I<0x19, MRMInitReg, (outs GR16:$dst), (ins), "", + [(set GR16:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>, + OpSize; +def SETB_C32r : I<0x19, MRMInitReg, (outs GR32:$dst), (ins), "", + [(set GR32:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>; +def SETB_C64r : RI<0x19, MRMInitReg, (outs GR64:$dst), (ins), "", + [(set GR64:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>; +} // isCodeGenOnly + + +def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), + (SETB_C16r)>; +def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), + (SETB_C32r)>; +def : Pat<(i64 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), + (SETB_C64r)>; + +def : Pat<(i16 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), + (SETB_C16r)>; +def : Pat<(i32 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), + (SETB_C32r)>; +def : Pat<(i64 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), + (SETB_C64r)>; + +// We canonicalize 'setb' to "(and (sbb reg,reg), 1)" on the hope that the and +// will be eliminated and that the sbb can be extended up to a wider type. When +// this happens, it is great. However, if we are left with an 8-bit sbb and an +// and, we might as well just match it as a setb. +def : Pat<(and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1), + (SETBr)>; + +// (add OP, SETB) -> (adc OP, 0) +def : Pat<(add (and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR8:$op), + (ADC8ri GR8:$op, 0)>; +def : Pat<(add (and (i32 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR32:$op), + (ADC32ri8 GR32:$op, 0)>; +def : Pat<(add (and (i64 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR64:$op), + (ADC64ri8 GR64:$op, 0)>; + +// (sub OP, SETB) -> (sbb OP, 0) +def : Pat<(sub GR8:$op, (and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1)), + (SBB8ri GR8:$op, 0)>; +def : Pat<(sub GR32:$op, (and (i32 (X86setcc_c X86_COND_B, EFLAGS)), 1)), + (SBB32ri8 GR32:$op, 0)>; +def : Pat<(sub GR64:$op, (and (i64 (X86setcc_c X86_COND_B, EFLAGS)), 1)), + (SBB64ri8 GR64:$op, 0)>; + +// (sub OP, SETCC_CARRY) -> (adc OP, 0) +def : Pat<(sub GR8:$op, (i8 (X86setcc_c X86_COND_B, EFLAGS))), + (ADC8ri GR8:$op, 0)>; +def : Pat<(sub GR32:$op, (i32 (X86setcc_c X86_COND_B, EFLAGS))), + (ADC32ri8 GR32:$op, 0)>; +def : Pat<(sub GR64:$op, (i64 (X86setcc_c X86_COND_B, EFLAGS))), + (ADC64ri8 GR64:$op, 0)>; + +//===----------------------------------------------------------------------===// +// String Pseudo Instructions +// +let Defs = [ECX,EDI,ESI], Uses = [ECX,EDI,ESI], isCodeGenOnly = 1 in { +def REP_MOVSB : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}", + [(X86rep_movs i8)]>, REP; +def REP_MOVSW : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw|rep movsw}", + [(X86rep_movs i16)]>, REP, OpSize; +def REP_MOVSD : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl|rep movsd}", + [(X86rep_movs i32)]>, REP; +} + +let Defs = [RCX,RDI,RSI], Uses = [RCX,RDI,RSI], isCodeGenOnly = 1 in +def REP_MOVSQ : RI<0xA5, RawFrm, (outs), (ins), "{rep;movsq|rep movsq}", + [(X86rep_movs i64)]>, REP; + + +// FIXME: Should use "(X86rep_stos AL)" as the pattern. 
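The REP_STOS definitions the FIXME refers to follow immediately; together with the REP_MOVS pseudos above they model the rep-prefixed string instructions the backend may use when it expands memcpy/memset inline, with the fixed ECX/EDI/ESI (and AL/AX/EAX/RAX) operands expressed as implicit uses and defs. Source-level operations of this shape are the candidates for that expansion, though whether rep movs/stos is actually chosen depends on size and tuning heuristics:

#include <cstring>

struct Block { char bytes[256]; };

// Copies and fills like these may be lowered inline: rep movs walks ESI/EDI
// with the element count in ECX, rep stos stores AL/AX/EAX/RAX through EDI.
void copy_block(Block &dst, const Block &src) {
  std::memcpy(&dst, &src, sizeof(Block));
}

void clear_block(Block &b) {
  std::memset(&b, 0, sizeof(Block));
}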
+let Defs = [ECX,EDI], Uses = [AL,ECX,EDI], isCodeGenOnly = 1 in +def REP_STOSB : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb|rep stosb}", + [(X86rep_stos i8)]>, REP; +let Defs = [ECX,EDI], Uses = [AX,ECX,EDI], isCodeGenOnly = 1 in +def REP_STOSW : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw|rep stosw}", + [(X86rep_stos i16)]>, REP, OpSize; +let Defs = [ECX,EDI], Uses = [EAX,ECX,EDI], isCodeGenOnly = 1 in +def REP_STOSD : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}", + [(X86rep_stos i32)]>, REP; + +let Defs = [RCX,RDI], Uses = [RAX,RCX,RDI], isCodeGenOnly = 1 in +def REP_STOSQ : RI<0xAB, RawFrm, (outs), (ins), "{rep;stosq|rep stosq}", + [(X86rep_stos i64)]>, REP; + + +//===----------------------------------------------------------------------===// +// Thread Local Storage Instructions +// + +// ELF TLS Support +// All calls clobber the non-callee saved registers. ESP is marked as +// a use to prevent stack-pointer assignments that appear immediately +// before calls from potentially appearing dead. +let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, + MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, + XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, + XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS], + Uses = [ESP] in +def TLS_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym), + "# TLS_addr32", + [(X86tlsaddr tls32addr:$sym)]>, + Requires<[In32BitMode]>; + +// All calls clobber the non-callee saved registers. RSP is marked as +// a use to prevent stack-pointer assignments that appear immediately +// before calls from potentially appearing dead. +let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, + FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, ST1, + MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, + XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, + XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS], + Uses = [RSP] in +def TLS_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym), + "# TLS_addr64", + [(X86tlsaddr tls64addr:$sym)]>, + Requires<[In64BitMode]>; + +// Darwin TLS Support +// For i386, the address of the thunk is passed on the stack, on return the +// address of the variable is in %eax. %ecx is trashed during the function +// call. All other registers are preserved. +let Defs = [EAX, ECX, EFLAGS], + Uses = [ESP], + usesCustomInserter = 1 in +def TLSCall_32 : I<0, Pseudo, (outs), (ins i32mem:$sym), + "# TLSCall_32", + [(X86TLSCall addr:$sym)]>, + Requires<[In32BitMode]>; + +// For x86_64, the address of the thunk is passed in %rdi, on return +// the address of the variable is in %rax. All other registers are preserved. +let Defs = [RAX, EFLAGS], + Uses = [RSP, RDI], + usesCustomInserter = 1 in +def TLSCall_64 : I<0, Pseudo, (outs), (ins i64mem:$sym), + "# TLSCall_64", + [(X86TLSCall addr:$sym)]>, + Requires<[In64BitMode]>; + + +//===----------------------------------------------------------------------===// +// Conditional Move Pseudo Instructions + +// X86 doesn't have 8-bit conditional moves. Use a customInserter to +// emit control flow. An alternative to this is to mark i8 SELECT as Promote, +// however that requires promoting the operands, and can induce additional +// i8 register pressure. 
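Conceptually, the custom inserter turns the CMOV_GR8 pseudo defined next into a short conditional branch with a PHI at the join point; in source-level terms the expansion behaves like this sketch (the real work happens on MachineBasicBlocks, not in C++):

#include <cstdint>

// Shorthand for the control flow CMOV_GR8 expands to, since there is no
// 8-bit cmov instruction to select directly.
static uint8_t select_i8(bool cond, uint8_t if_true, uint8_t if_false) {
  uint8_t result;
  if (cond)               // branch on the tested EFLAGS condition
    result = if_true;     // copy one operand
  else
    result = if_false;    // copy the other
  return result;          // PHI of the two values at the join block
}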
+let usesCustomInserter = 1, Uses = [EFLAGS] in { +def CMOV_GR8 : I<0, Pseudo, + (outs GR8:$dst), (ins GR8:$src1, GR8:$src2, i8imm:$cond), + "#CMOV_GR8 PSEUDO!", + [(set GR8:$dst, (X86cmov GR8:$src1, GR8:$src2, + imm:$cond, EFLAGS))]>; + +let Predicates = [NoCMov] in { +def CMOV_GR32 : I<0, Pseudo, + (outs GR32:$dst), (ins GR32:$src1, GR32:$src2, i8imm:$cond), + "#CMOV_GR32* PSEUDO!", + [(set GR32:$dst, + (X86cmov GR32:$src1, GR32:$src2, imm:$cond, EFLAGS))]>; +def CMOV_GR16 : I<0, Pseudo, + (outs GR16:$dst), (ins GR16:$src1, GR16:$src2, i8imm:$cond), + "#CMOV_GR16* PSEUDO!", + [(set GR16:$dst, + (X86cmov GR16:$src1, GR16:$src2, imm:$cond, EFLAGS))]>; +def CMOV_RFP32 : I<0, Pseudo, + (outs RFP32:$dst), + (ins RFP32:$src1, RFP32:$src2, i8imm:$cond), + "#CMOV_RFP32 PSEUDO!", + [(set RFP32:$dst, + (X86cmov RFP32:$src1, RFP32:$src2, imm:$cond, + EFLAGS))]>; +def CMOV_RFP64 : I<0, Pseudo, + (outs RFP64:$dst), + (ins RFP64:$src1, RFP64:$src2, i8imm:$cond), + "#CMOV_RFP64 PSEUDO!", + [(set RFP64:$dst, + (X86cmov RFP64:$src1, RFP64:$src2, imm:$cond, + EFLAGS))]>; +def CMOV_RFP80 : I<0, Pseudo, + (outs RFP80:$dst), + (ins RFP80:$src1, RFP80:$src2, i8imm:$cond), + "#CMOV_RFP80 PSEUDO!", + [(set RFP80:$dst, + (X86cmov RFP80:$src1, RFP80:$src2, imm:$cond, + EFLAGS))]>; +} // Predicates = [NoCMov] +} // UsesCustomInserter = 1, Uses = [EFLAGS] + + +//===----------------------------------------------------------------------===// +// Atomic Instruction Pseudo Instructions +//===----------------------------------------------------------------------===// + +// Atomic exchange, and, or, xor +let Constraints = "$val = $dst", Defs = [EFLAGS], + usesCustomInserter = 1 in { + +def ATOMAND8 : I<0, Pseudo, (outs GR8:$dst),(ins i8mem:$ptr, GR8:$val), + "#ATOMAND8 PSEUDO!", + [(set GR8:$dst, (atomic_load_and_8 addr:$ptr, GR8:$val))]>; +def ATOMOR8 : I<0, Pseudo, (outs GR8:$dst),(ins i8mem:$ptr, GR8:$val), + "#ATOMOR8 PSEUDO!", + [(set GR8:$dst, (atomic_load_or_8 addr:$ptr, GR8:$val))]>; +def ATOMXOR8 : I<0, Pseudo,(outs GR8:$dst),(ins i8mem:$ptr, GR8:$val), + "#ATOMXOR8 PSEUDO!", + [(set GR8:$dst, (atomic_load_xor_8 addr:$ptr, GR8:$val))]>; +def ATOMNAND8 : I<0, Pseudo,(outs GR8:$dst),(ins i8mem:$ptr, GR8:$val), + "#ATOMNAND8 PSEUDO!", + [(set GR8:$dst, (atomic_load_nand_8 addr:$ptr, GR8:$val))]>; + +def ATOMAND16 : I<0, Pseudo, (outs GR16:$dst),(ins i16mem:$ptr, GR16:$val), + "#ATOMAND16 PSEUDO!", + [(set GR16:$dst, (atomic_load_and_16 addr:$ptr, GR16:$val))]>; +def ATOMOR16 : I<0, Pseudo, (outs GR16:$dst),(ins i16mem:$ptr, GR16:$val), + "#ATOMOR16 PSEUDO!", + [(set GR16:$dst, (atomic_load_or_16 addr:$ptr, GR16:$val))]>; +def ATOMXOR16 : I<0, Pseudo,(outs GR16:$dst),(ins i16mem:$ptr, GR16:$val), + "#ATOMXOR16 PSEUDO!", + [(set GR16:$dst, (atomic_load_xor_16 addr:$ptr, GR16:$val))]>; +def ATOMNAND16 : I<0, Pseudo,(outs GR16:$dst),(ins i16mem:$ptr, GR16:$val), + "#ATOMNAND16 PSEUDO!", + [(set GR16:$dst, (atomic_load_nand_16 addr:$ptr, GR16:$val))]>; +def ATOMMIN16: I<0, Pseudo, (outs GR16:$dst), (ins i16mem:$ptr, GR16:$val), + "#ATOMMIN16 PSEUDO!", + [(set GR16:$dst, (atomic_load_min_16 addr:$ptr, GR16:$val))]>; +def ATOMMAX16: I<0, Pseudo, (outs GR16:$dst),(ins i16mem:$ptr, GR16:$val), + "#ATOMMAX16 PSEUDO!", + [(set GR16:$dst, (atomic_load_max_16 addr:$ptr, GR16:$val))]>; +def ATOMUMIN16: I<0, Pseudo, (outs GR16:$dst),(ins i16mem:$ptr, GR16:$val), + "#ATOMUMIN16 PSEUDO!", + [(set GR16:$dst, (atomic_load_umin_16 addr:$ptr, GR16:$val))]>; +def ATOMUMAX16: I<0, Pseudo, (outs GR16:$dst),(ins i16mem:$ptr, GR16:$val), + 
"#ATOMUMAX16 PSEUDO!", + [(set GR16:$dst, (atomic_load_umax_16 addr:$ptr, GR16:$val))]>; + + +def ATOMAND32 : I<0, Pseudo, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val), + "#ATOMAND32 PSEUDO!", + [(set GR32:$dst, (atomic_load_and_32 addr:$ptr, GR32:$val))]>; +def ATOMOR32 : I<0, Pseudo, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val), + "#ATOMOR32 PSEUDO!", + [(set GR32:$dst, (atomic_load_or_32 addr:$ptr, GR32:$val))]>; +def ATOMXOR32 : I<0, Pseudo,(outs GR32:$dst),(ins i32mem:$ptr, GR32:$val), + "#ATOMXOR32 PSEUDO!", + [(set GR32:$dst, (atomic_load_xor_32 addr:$ptr, GR32:$val))]>; +def ATOMNAND32 : I<0, Pseudo,(outs GR32:$dst),(ins i32mem:$ptr, GR32:$val), + "#ATOMNAND32 PSEUDO!", + [(set GR32:$dst, (atomic_load_nand_32 addr:$ptr, GR32:$val))]>; +def ATOMMIN32: I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$ptr, GR32:$val), + "#ATOMMIN32 PSEUDO!", + [(set GR32:$dst, (atomic_load_min_32 addr:$ptr, GR32:$val))]>; +def ATOMMAX32: I<0, Pseudo, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val), + "#ATOMMAX32 PSEUDO!", + [(set GR32:$dst, (atomic_load_max_32 addr:$ptr, GR32:$val))]>; +def ATOMUMIN32: I<0, Pseudo, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val), + "#ATOMUMIN32 PSEUDO!", + [(set GR32:$dst, (atomic_load_umin_32 addr:$ptr, GR32:$val))]>; +def ATOMUMAX32: I<0, Pseudo, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val), + "#ATOMUMAX32 PSEUDO!", + [(set GR32:$dst, (atomic_load_umax_32 addr:$ptr, GR32:$val))]>; + + + +def ATOMAND64 : I<0, Pseudo, (outs GR64:$dst),(ins i64mem:$ptr, GR64:$val), + "#ATOMAND64 PSEUDO!", + [(set GR64:$dst, (atomic_load_and_64 addr:$ptr, GR64:$val))]>; +def ATOMOR64 : I<0, Pseudo, (outs GR64:$dst),(ins i64mem:$ptr, GR64:$val), + "#ATOMOR64 PSEUDO!", + [(set GR64:$dst, (atomic_load_or_64 addr:$ptr, GR64:$val))]>; +def ATOMXOR64 : I<0, Pseudo,(outs GR64:$dst),(ins i64mem:$ptr, GR64:$val), + "#ATOMXOR64 PSEUDO!", + [(set GR64:$dst, (atomic_load_xor_64 addr:$ptr, GR64:$val))]>; +def ATOMNAND64 : I<0, Pseudo,(outs GR64:$dst),(ins i64mem:$ptr, GR64:$val), + "#ATOMNAND64 PSEUDO!", + [(set GR64:$dst, (atomic_load_nand_64 addr:$ptr, GR64:$val))]>; +def ATOMMIN64: I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$ptr, GR64:$val), + "#ATOMMIN64 PSEUDO!", + [(set GR64:$dst, (atomic_load_min_64 addr:$ptr, GR64:$val))]>; +def ATOMMAX64: I<0, Pseudo, (outs GR64:$dst),(ins i64mem:$ptr, GR64:$val), + "#ATOMMAX64 PSEUDO!", + [(set GR64:$dst, (atomic_load_max_64 addr:$ptr, GR64:$val))]>; +def ATOMUMIN64: I<0, Pseudo, (outs GR64:$dst),(ins i64mem:$ptr, GR64:$val), + "#ATOMUMIN64 PSEUDO!", + [(set GR64:$dst, (atomic_load_umin_64 addr:$ptr, GR64:$val))]>; +def ATOMUMAX64: I<0, Pseudo, (outs GR64:$dst),(ins i64mem:$ptr, GR64:$val), + "#ATOMUMAX64 PSEUDO!", + [(set GR64:$dst, (atomic_load_umax_64 addr:$ptr, GR64:$val))]>; +} + +let Constraints = "$val1 = $dst1, $val2 = $dst2", + Defs = [EFLAGS, EAX, EBX, ECX, EDX], + Uses = [EAX, EBX, ECX, EDX], + mayLoad = 1, mayStore = 1, + usesCustomInserter = 1 in { +def ATOMAND6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2), + (ins i64mem:$ptr, GR32:$val1, GR32:$val2), + "#ATOMAND6432 PSEUDO!", []>; +def ATOMOR6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2), + (ins i64mem:$ptr, GR32:$val1, GR32:$val2), + "#ATOMOR6432 PSEUDO!", []>; +def ATOMXOR6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2), + (ins i64mem:$ptr, GR32:$val1, GR32:$val2), + "#ATOMXOR6432 PSEUDO!", []>; +def ATOMNAND6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2), + (ins i64mem:$ptr, GR32:$val1, GR32:$val2), + "#ATOMNAND6432 PSEUDO!", []>; +def ATOMADD6432 : I<0, Pseudo, (outs GR32:$dst1, 
GR32:$dst2), + (ins i64mem:$ptr, GR32:$val1, GR32:$val2), + "#ATOMADD6432 PSEUDO!", []>; +def ATOMSUB6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2), + (ins i64mem:$ptr, GR32:$val1, GR32:$val2), + "#ATOMSUB6432 PSEUDO!", []>; +def ATOMSWAP6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2), + (ins i64mem:$ptr, GR32:$val1, GR32:$val2), + "#ATOMSWAP6432 PSEUDO!", []>; +} + +//===----------------------------------------------------------------------===// +// Normal-Instructions-With-Lock-Prefix Pseudo Instructions +//===----------------------------------------------------------------------===// + +// FIXME: Use normal instructions and add lock prefix dynamically. + +// Memory barriers + +// TODO: Get this to fold the constant into the instruction. +let isCodeGenOnly = 1 in +def OR32mrLocked : I<0x09, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$zero), + "lock\n\t" + "or{l}\t{$zero, $dst|$dst, $zero}", + []>, Requires<[In32BitMode]>, LOCK; + +let hasSideEffects = 1 in +def Int_MemBarrier : I<0, Pseudo, (outs), (ins), + "#MEMBARRIER", + [(X86MemBarrier)]>; + +// TODO: Get this to fold the constant into the instruction. +let hasSideEffects = 1, Defs = [ESP], isCodeGenOnly = 1 in +def Int_MemBarrierNoSSE64 : RI<0x09, MRM1r, (outs), (ins GR64:$zero), + "lock\n\t" + "or{q}\t{$zero, (%rsp)|(%rsp), $zero}", + [(X86MemBarrierNoSSE GR64:$zero)]>, + Requires<[In64BitMode]>, LOCK; + + +// RegOpc corresponds to the mr version of the instruction +// ImmOpc corresponds to the mi version of the instruction +// ImmOpc8 corresponds to the mi8 version of the instruction +// ImmMod corresponds to the instruction format of the mi and mi8 versions +multiclass LOCK_ArithBinOp<bits<8> RegOpc, bits<8> ImmOpc, bits<8> ImmOpc8, + Format ImmMod, string mnemonic> { +let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1 in { + +def #NAME#8mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, + RegOpc{3}, RegOpc{2}, RegOpc{1}, 0 }, + MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2), + !strconcat("lock\n\t", mnemonic, "{b}\t", + "{$src2, $dst|$dst, $src2}"), + []>, LOCK; +def #NAME#16mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, + RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 }, + MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2), + !strconcat("lock\n\t", mnemonic, "{w}\t", + "{$src2, $dst|$dst, $src2}"), + []>, OpSize, LOCK; +def #NAME#32mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, + RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 }, + MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2), + !strconcat("lock\n\t", mnemonic, "{l}\t", + "{$src2, $dst|$dst, $src2}"), + []>, LOCK; +def #NAME#64mr : RI<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, + RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 }, + MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), + !strconcat("lock\n\t", mnemonic, "{q}\t", + "{$src2, $dst|$dst, $src2}"), + []>, LOCK; + +def #NAME#8mi : Ii8<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, + ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 0 }, + ImmMod, (outs), (ins i8mem :$dst, i8imm :$src2), + !strconcat("lock\n\t", mnemonic, "{b}\t", + "{$src2, $dst|$dst, $src2}"), + []>, LOCK; + +def #NAME#16mi : Ii16<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, + ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 }, + ImmMod, (outs), (ins i16mem :$dst, i16imm :$src2), + !strconcat("lock\n\t", mnemonic, "{w}\t", + "{$src2, $dst|$dst, $src2}"), + []>, LOCK; + +def #NAME#32mi : Ii32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, + ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 }, + ImmMod, (outs), (ins i32mem :$dst, i32imm :$src2), + !strconcat("lock\n\t", mnemonic, "{l}\t", + "{$src2, 
$dst|$dst, $src2}"), + []>, LOCK; + +def #NAME#64mi32 : RIi32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, + ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 }, + ImmMod, (outs), (ins i64mem :$dst, i64i32imm :$src2), + !strconcat("lock\n\t", mnemonic, "{q}\t", + "{$src2, $dst|$dst, $src2}"), + []>, LOCK; + +def #NAME#16mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, + ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, + ImmMod, (outs), (ins i16mem :$dst, i16i8imm :$src2), + !strconcat("lock\n\t", mnemonic, "{w}\t", + "{$src2, $dst|$dst, $src2}"), + []>, LOCK; +def #NAME#32mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, + ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, + ImmMod, (outs), (ins i32mem :$dst, i32i8imm :$src2), + !strconcat("lock\n\t", mnemonic, "{l}\t", + "{$src2, $dst|$dst, $src2}"), + []>, LOCK; +def #NAME#64mi8 : RIi8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, + ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, + ImmMod, (outs), (ins i64mem :$dst, i64i8imm :$src2), + !strconcat("lock\n\t", mnemonic, "{q}\t", + "{$src2, $dst|$dst, $src2}"), + []>, LOCK; + +} + +} + +defm LOCK_ADD : LOCK_ArithBinOp<0x00, 0x80, 0x83, MRM0m, "add">; +defm LOCK_SUB : LOCK_ArithBinOp<0x28, 0x80, 0x83, MRM5m, "sub">; +defm LOCK_OR : LOCK_ArithBinOp<0x08, 0x80, 0x83, MRM1m, "or">; +defm LOCK_AND : LOCK_ArithBinOp<0x20, 0x80, 0x83, MRM4m, "and">; +defm LOCK_XOR : LOCK_ArithBinOp<0x30, 0x80, 0x83, MRM6m, "xor">; + +// Optimized codegen when the non-memory output is not used. +let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1 in { + +def LOCK_INC8m : I<0xFE, MRM0m, (outs), (ins i8mem :$dst), + "lock\n\t" + "inc{b}\t$dst", []>, LOCK; +def LOCK_INC16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst), + "lock\n\t" + "inc{w}\t$dst", []>, OpSize, LOCK; +def LOCK_INC32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst), + "lock\n\t" + "inc{l}\t$dst", []>, LOCK; +def LOCK_INC64m : RI<0xFF, MRM0m, (outs), (ins i64mem:$dst), + "lock\n\t" + "inc{q}\t$dst", []>, LOCK; + +def LOCK_DEC8m : I<0xFE, MRM1m, (outs), (ins i8mem :$dst), + "lock\n\t" + "dec{b}\t$dst", []>, LOCK; +def LOCK_DEC16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst), + "lock\n\t" + "dec{w}\t$dst", []>, OpSize, LOCK; +def LOCK_DEC32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst), + "lock\n\t" + "dec{l}\t$dst", []>, LOCK; +def LOCK_DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), + "lock\n\t" + "dec{q}\t$dst", []>, LOCK; +} + +// Atomic compare and swap. 
+let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX], + isCodeGenOnly = 1 in +def LCMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$ptr), + "lock\n\t" + "cmpxchg8b\t$ptr", + [(X86cas8 addr:$ptr)]>, TB, LOCK; + +let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX], + isCodeGenOnly = 1 in +def LCMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$ptr), + "lock\n\t" + "cmpxchg16b\t$ptr", + [(X86cas16 addr:$ptr)]>, TB, LOCK, + Requires<[HasCmpxchg16b]>; + +let Defs = [AL, EFLAGS], Uses = [AL], isCodeGenOnly = 1 in { +def LCMPXCHG8 : I<0xB0, MRMDestMem, (outs), (ins i8mem:$ptr, GR8:$swap), + "lock\n\t" + "cmpxchg{b}\t{$swap, $ptr|$ptr, $swap}", + [(X86cas addr:$ptr, GR8:$swap, 1)]>, TB, LOCK; +} + +let Defs = [AX, EFLAGS], Uses = [AX], isCodeGenOnly = 1 in { +def LCMPXCHG16 : I<0xB1, MRMDestMem, (outs), (ins i16mem:$ptr, GR16:$swap), + "lock\n\t" + "cmpxchg{w}\t{$swap, $ptr|$ptr, $swap}", + [(X86cas addr:$ptr, GR16:$swap, 2)]>, TB, OpSize, LOCK; +} + +let Defs = [EAX, EFLAGS], Uses = [EAX], isCodeGenOnly = 1 in { +def LCMPXCHG32 : I<0xB1, MRMDestMem, (outs), (ins i32mem:$ptr, GR32:$swap), + "lock\n\t" + "cmpxchg{l}\t{$swap, $ptr|$ptr, $swap}", + [(X86cas addr:$ptr, GR32:$swap, 4)]>, TB, LOCK; +} + +let Defs = [RAX, EFLAGS], Uses = [RAX], isCodeGenOnly = 1 in { +def LCMPXCHG64 : RI<0xB1, MRMDestMem, (outs), (ins i64mem:$ptr, GR64:$swap), + "lock\n\t" + "cmpxchg{q}\t{$swap, $ptr|$ptr, $swap}", + [(X86cas addr:$ptr, GR64:$swap, 8)]>, TB, LOCK; +} + +// Atomic exchange and add +let Constraints = "$val = $dst", Defs = [EFLAGS], isCodeGenOnly = 1 in { +def LXADD8 : I<0xC0, MRMSrcMem, (outs GR8:$dst), (ins GR8:$val, i8mem:$ptr), + "lock\n\t" + "xadd{b}\t{$val, $ptr|$ptr, $val}", + [(set GR8:$dst, (atomic_load_add_8 addr:$ptr, GR8:$val))]>, + TB, LOCK; +def LXADD16 : I<0xC1, MRMSrcMem, (outs GR16:$dst), (ins GR16:$val, i16mem:$ptr), + "lock\n\t" + "xadd{w}\t{$val, $ptr|$ptr, $val}", + [(set GR16:$dst, (atomic_load_add_16 addr:$ptr, GR16:$val))]>, + TB, OpSize, LOCK; +def LXADD32 : I<0xC1, MRMSrcMem, (outs GR32:$dst), (ins GR32:$val, i32mem:$ptr), + "lock\n\t" + "xadd{l}\t{$val, $ptr|$ptr, $val}", + [(set GR32:$dst, (atomic_load_add_32 addr:$ptr, GR32:$val))]>, + TB, LOCK; +def LXADD64 : RI<0xC1, MRMSrcMem, (outs GR64:$dst), (ins GR64:$val,i64mem:$ptr), + "lock\n\t" + "xadd{q}\t{$val, $ptr|$ptr, $val}", + [(set GR64:$dst, (atomic_load_add_64 addr:$ptr, GR64:$val))]>, + TB, LOCK; +} + +def ACQUIRE_MOV8rm : I<0, Pseudo, (outs GR8 :$dst), (ins i8mem :$src), + "#ACQUIRE_MOV PSEUDO!", + [(set GR8:$dst, (atomic_load_8 addr:$src))]>; +def ACQUIRE_MOV16rm : I<0, Pseudo, (outs GR16:$dst), (ins i16mem:$src), + "#ACQUIRE_MOV PSEUDO!", + [(set GR16:$dst, (atomic_load_16 addr:$src))]>; +def ACQUIRE_MOV32rm : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$src), + "#ACQUIRE_MOV PSEUDO!", + [(set GR32:$dst, (atomic_load_32 addr:$src))]>; +def ACQUIRE_MOV64rm : I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$src), + "#ACQUIRE_MOV PSEUDO!", + [(set GR64:$dst, (atomic_load_64 addr:$src))]>; + +def RELEASE_MOV8mr : I<0, Pseudo, (outs), (ins i8mem :$dst, GR8 :$src), + "#RELEASE_MOV PSEUDO!", + [(atomic_store_8 addr:$dst, GR8 :$src)]>; +def RELEASE_MOV16mr : I<0, Pseudo, (outs), (ins i16mem:$dst, GR16:$src), + "#RELEASE_MOV PSEUDO!", + [(atomic_store_16 addr:$dst, GR16:$src)]>; +def RELEASE_MOV32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, GR32:$src), + "#RELEASE_MOV PSEUDO!", + [(atomic_store_32 addr:$dst, GR32:$src)]>; +def RELEASE_MOV64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, GR64:$src), + "#RELEASE_MOV PSEUDO!", + 
[(atomic_store_64 addr:$dst, GR64:$src)]>; + +//===----------------------------------------------------------------------===// +// Conditional Move Pseudo Instructions. +//===----------------------------------------------------------------------===// + + +// CMOV* - Used to implement the SSE SELECT DAG operation. Expanded after +// instruction selection into a branch sequence. +let Uses = [EFLAGS], usesCustomInserter = 1 in { + def CMOV_FR32 : I<0, Pseudo, + (outs FR32:$dst), (ins FR32:$t, FR32:$f, i8imm:$cond), + "#CMOV_FR32 PSEUDO!", + [(set FR32:$dst, (X86cmov FR32:$t, FR32:$f, imm:$cond, + EFLAGS))]>; + def CMOV_FR64 : I<0, Pseudo, + (outs FR64:$dst), (ins FR64:$t, FR64:$f, i8imm:$cond), + "#CMOV_FR64 PSEUDO!", + [(set FR64:$dst, (X86cmov FR64:$t, FR64:$f, imm:$cond, + EFLAGS))]>; + def CMOV_V4F32 : I<0, Pseudo, + (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond), + "#CMOV_V4F32 PSEUDO!", + [(set VR128:$dst, + (v4f32 (X86cmov VR128:$t, VR128:$f, imm:$cond, + EFLAGS)))]>; + def CMOV_V2F64 : I<0, Pseudo, + (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond), + "#CMOV_V2F64 PSEUDO!", + [(set VR128:$dst, + (v2f64 (X86cmov VR128:$t, VR128:$f, imm:$cond, + EFLAGS)))]>; + def CMOV_V2I64 : I<0, Pseudo, + (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond), + "#CMOV_V2I64 PSEUDO!", + [(set VR128:$dst, + (v2i64 (X86cmov VR128:$t, VR128:$f, imm:$cond, + EFLAGS)))]>; + def CMOV_V8F32 : I<0, Pseudo, + (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond), + "#CMOV_V8F32 PSEUDO!", + [(set VR256:$dst, + (v8f32 (X86cmov VR256:$t, VR256:$f, imm:$cond, + EFLAGS)))]>; + def CMOV_V4F64 : I<0, Pseudo, + (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond), + "#CMOV_V4F64 PSEUDO!", + [(set VR256:$dst, + (v4f64 (X86cmov VR256:$t, VR256:$f, imm:$cond, + EFLAGS)))]>; + def CMOV_V4I64 : I<0, Pseudo, + (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond), + "#CMOV_V4I64 PSEUDO!", + [(set VR256:$dst, + (v4i64 (X86cmov VR256:$t, VR256:$f, imm:$cond, + EFLAGS)))]>; +} + + +//===----------------------------------------------------------------------===// +// DAG Pattern Matching Rules +//===----------------------------------------------------------------------===// + +// ConstantPool GlobalAddress, ExternalSymbol, and JumpTable +def : Pat<(i32 (X86Wrapper tconstpool :$dst)), (MOV32ri tconstpool :$dst)>; +def : Pat<(i32 (X86Wrapper tjumptable :$dst)), (MOV32ri tjumptable :$dst)>; +def : Pat<(i32 (X86Wrapper tglobaltlsaddr:$dst)),(MOV32ri tglobaltlsaddr:$dst)>; +def : Pat<(i32 (X86Wrapper tglobaladdr :$dst)), (MOV32ri tglobaladdr :$dst)>; +def : Pat<(i32 (X86Wrapper texternalsym:$dst)), (MOV32ri texternalsym:$dst)>; +def : Pat<(i32 (X86Wrapper tblockaddress:$dst)), (MOV32ri tblockaddress:$dst)>; + +def : Pat<(add GR32:$src1, (X86Wrapper tconstpool:$src2)), + (ADD32ri GR32:$src1, tconstpool:$src2)>; +def : Pat<(add GR32:$src1, (X86Wrapper tjumptable:$src2)), + (ADD32ri GR32:$src1, tjumptable:$src2)>; +def : Pat<(add GR32:$src1, (X86Wrapper tglobaladdr :$src2)), + (ADD32ri GR32:$src1, tglobaladdr:$src2)>; +def : Pat<(add GR32:$src1, (X86Wrapper texternalsym:$src2)), + (ADD32ri GR32:$src1, texternalsym:$src2)>; +def : Pat<(add GR32:$src1, (X86Wrapper tblockaddress:$src2)), + (ADD32ri GR32:$src1, tblockaddress:$src2)>; + +def : Pat<(store (i32 (X86Wrapper tglobaladdr:$src)), addr:$dst), + (MOV32mi addr:$dst, tglobaladdr:$src)>; +def : Pat<(store (i32 (X86Wrapper texternalsym:$src)), addr:$dst), + (MOV32mi addr:$dst, texternalsym:$src)>; +def : Pat<(store (i32 (X86Wrapper tblockaddress:$src)), 
addr:$dst), + (MOV32mi addr:$dst, tblockaddress:$src)>; + + + +// ConstantPool GlobalAddress, ExternalSymbol, and JumpTable when not in small +// code model mode, should use 'movabs'. FIXME: This is really a hack, the +// 'movabs' predicate should handle this sort of thing. +def : Pat<(i64 (X86Wrapper tconstpool :$dst)), + (MOV64ri tconstpool :$dst)>, Requires<[FarData]>; +def : Pat<(i64 (X86Wrapper tjumptable :$dst)), + (MOV64ri tjumptable :$dst)>, Requires<[FarData]>; +def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)), + (MOV64ri tglobaladdr :$dst)>, Requires<[FarData]>; +def : Pat<(i64 (X86Wrapper texternalsym:$dst)), + (MOV64ri texternalsym:$dst)>, Requires<[FarData]>; +def : Pat<(i64 (X86Wrapper tblockaddress:$dst)), + (MOV64ri tblockaddress:$dst)>, Requires<[FarData]>; + +// In static codegen with small code model, we can get the address of a label +// into a register with 'movl'. FIXME: This is a hack, the 'imm' predicate of +// the MOV64ri64i32 should accept these. +def : Pat<(i64 (X86Wrapper tconstpool :$dst)), + (MOV64ri64i32 tconstpool :$dst)>, Requires<[SmallCode]>; +def : Pat<(i64 (X86Wrapper tjumptable :$dst)), + (MOV64ri64i32 tjumptable :$dst)>, Requires<[SmallCode]>; +def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)), + (MOV64ri64i32 tglobaladdr :$dst)>, Requires<[SmallCode]>; +def : Pat<(i64 (X86Wrapper texternalsym:$dst)), + (MOV64ri64i32 texternalsym:$dst)>, Requires<[SmallCode]>; +def : Pat<(i64 (X86Wrapper tblockaddress:$dst)), + (MOV64ri64i32 tblockaddress:$dst)>, Requires<[SmallCode]>; + +// In kernel code model, we can get the address of a label +// into a register with 'movq'. FIXME: This is a hack, the 'imm' predicate of +// the MOV64ri32 should accept these. +def : Pat<(i64 (X86Wrapper tconstpool :$dst)), + (MOV64ri32 tconstpool :$dst)>, Requires<[KernelCode]>; +def : Pat<(i64 (X86Wrapper tjumptable :$dst)), + (MOV64ri32 tjumptable :$dst)>, Requires<[KernelCode]>; +def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)), + (MOV64ri32 tglobaladdr :$dst)>, Requires<[KernelCode]>; +def : Pat<(i64 (X86Wrapper texternalsym:$dst)), + (MOV64ri32 texternalsym:$dst)>, Requires<[KernelCode]>; +def : Pat<(i64 (X86Wrapper tblockaddress:$dst)), + (MOV64ri32 tblockaddress:$dst)>, Requires<[KernelCode]>; + +// If we have small model and -static mode, it is safe to store global addresses +// directly as immediates. FIXME: This is really a hack, the 'imm' predicate +// for MOV64mi32 should handle this sort of thing. +def : Pat<(store (i64 (X86Wrapper tconstpool:$src)), addr:$dst), + (MOV64mi32 addr:$dst, tconstpool:$src)>, + Requires<[NearData, IsStatic]>; +def : Pat<(store (i64 (X86Wrapper tjumptable:$src)), addr:$dst), + (MOV64mi32 addr:$dst, tjumptable:$src)>, + Requires<[NearData, IsStatic]>; +def : Pat<(store (i64 (X86Wrapper tglobaladdr:$src)), addr:$dst), + (MOV64mi32 addr:$dst, tglobaladdr:$src)>, + Requires<[NearData, IsStatic]>; +def : Pat<(store (i64 (X86Wrapper texternalsym:$src)), addr:$dst), + (MOV64mi32 addr:$dst, texternalsym:$src)>, + Requires<[NearData, IsStatic]>; +def : Pat<(store (i64 (X86Wrapper tblockaddress:$src)), addr:$dst), + (MOV64mi32 addr:$dst, tblockaddress:$src)>, + Requires<[NearData, IsStatic]>; + + + +// Calls + +// tls has some funny stuff here... 
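The funny stuff starts with the local-exec style of access, where a thread-local variable sits at a link-time-known @tpoff offset from the thread pointer; the three patterns below pick movabs, add, or a folded load for that offset. A source-level access that reduces to such an offset (the exact sequence depends on the TLS and code models):

// For TLS variables whose offset from the thread pointer is known at link
// time (e.g. variables defined in the main executable), an access like this
// becomes "thread pointer + @tpoff constant", which is what the patterns
// below encode.
thread_local long counter = 0;

long bump_counter() {
  return ++counter;
}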
+// This corresponds to movabs $foo@tpoff, %rax +def : Pat<(i64 (X86Wrapper tglobaltlsaddr :$dst)), + (MOV64ri tglobaltlsaddr :$dst)>; +// This corresponds to add $foo@tpoff, %rax +def : Pat<(add GR64:$src1, (X86Wrapper tglobaltlsaddr :$dst)), + (ADD64ri32 GR64:$src1, tglobaltlsaddr :$dst)>; +// This corresponds to mov foo@tpoff(%rbx), %eax +def : Pat<(load (i64 (X86Wrapper tglobaltlsaddr :$dst))), + (MOV64rm tglobaltlsaddr :$dst)>; + + +// Direct PC relative function call for small code model. 32-bit displacement +// sign extended to 64-bit. +def : Pat<(X86call (i64 tglobaladdr:$dst)), + (CALL64pcrel32 tglobaladdr:$dst)>, Requires<[NotWin64]>; +def : Pat<(X86call (i64 texternalsym:$dst)), + (CALL64pcrel32 texternalsym:$dst)>, Requires<[NotWin64]>; + +def : Pat<(X86call (i64 tglobaladdr:$dst)), + (WINCALL64pcrel32 tglobaladdr:$dst)>, Requires<[IsWin64]>; +def : Pat<(X86call (i64 texternalsym:$dst)), + (WINCALL64pcrel32 texternalsym:$dst)>, Requires<[IsWin64]>; + +// tailcall stuff +def : Pat<(X86tcret GR32_TC:$dst, imm:$off), + (TCRETURNri GR32_TC:$dst, imm:$off)>, + Requires<[In32BitMode]>; + +// FIXME: This is disabled for 32-bit PIC mode because the global base +// register which is part of the address mode may be assigned a +// callee-saved register. +def : Pat<(X86tcret (load addr:$dst), imm:$off), + (TCRETURNmi addr:$dst, imm:$off)>, + Requires<[In32BitMode, IsNotPIC]>; + +def : Pat<(X86tcret (i32 tglobaladdr:$dst), imm:$off), + (TCRETURNdi texternalsym:$dst, imm:$off)>, + Requires<[In32BitMode]>; + +def : Pat<(X86tcret (i32 texternalsym:$dst), imm:$off), + (TCRETURNdi texternalsym:$dst, imm:$off)>, + Requires<[In32BitMode]>; + +def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off), + (TCRETURNri64 ptr_rc_tailcall:$dst, imm:$off)>, + Requires<[In64BitMode]>; + +def : Pat<(X86tcret (load addr:$dst), imm:$off), + (TCRETURNmi64 addr:$dst, imm:$off)>, + Requires<[In64BitMode]>; + +def : Pat<(X86tcret (i64 tglobaladdr:$dst), imm:$off), + (TCRETURNdi64 tglobaladdr:$dst, imm:$off)>, + Requires<[In64BitMode]>; + +def : Pat<(X86tcret (i64 texternalsym:$dst), imm:$off), + (TCRETURNdi64 texternalsym:$dst, imm:$off)>, + Requires<[In64BitMode]>; + +// Normal calls, with various flavors of addresses. +def : Pat<(X86call (i32 tglobaladdr:$dst)), + (CALLpcrel32 tglobaladdr:$dst)>; +def : Pat<(X86call (i32 texternalsym:$dst)), + (CALLpcrel32 texternalsym:$dst)>; +def : Pat<(X86call (i32 imm:$dst)), + (CALLpcrel32 imm:$dst)>, Requires<[CallImmAddr]>; + +// Comparisons. + +// TEST R,R is smaller than CMP R,0 +def : Pat<(X86cmp GR8:$src1, 0), + (TEST8rr GR8:$src1, GR8:$src1)>; +def : Pat<(X86cmp GR16:$src1, 0), + (TEST16rr GR16:$src1, GR16:$src1)>; +def : Pat<(X86cmp GR32:$src1, 0), + (TEST32rr GR32:$src1, GR32:$src1)>; +def : Pat<(X86cmp GR64:$src1, 0), + (TEST64rr GR64:$src1, GR64:$src1)>; + +// Conditional moves with folded loads with operands swapped and conditions +// inverted. 
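The underlying identity is simply that swapping the two values of a select while inverting its condition gives the same result; selection uses it so the loaded value lands in the operand slot a cmov can read from memory, with the condition code flipped to compensate (X86_COND_B pairs with CMOVAE and so on, as the defm list below shows). A quick self-check:

#include <cassert>

static int pick(bool cond, int if_true, int if_false) {
  return cond ? if_true : if_false;
}

int main() {
  int from_memory = 42, in_register = 7;
  // Swap the operands and invert the condition: same result either way, so
  // the form with the load in the foldable position can always be chosen.
  assert(pick(true,  from_memory, in_register) ==
         pick(false, in_register, from_memory));
  assert(pick(false, from_memory, in_register) ==
         pick(true,  in_register, from_memory));
  return 0;
}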
+multiclass CMOVmr<PatLeaf InvertedCond, Instruction Inst16, Instruction Inst32, + Instruction Inst64> { + def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, InvertedCond, EFLAGS), + (Inst16 GR16:$src2, addr:$src1)>; + def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, InvertedCond, EFLAGS), + (Inst32 GR32:$src2, addr:$src1)>; + def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, InvertedCond, EFLAGS), + (Inst64 GR64:$src2, addr:$src1)>; +} + +defm : CMOVmr<X86_COND_B , CMOVAE16rm, CMOVAE32rm, CMOVAE64rm>; +defm : CMOVmr<X86_COND_AE, CMOVB16rm , CMOVB32rm , CMOVB64rm>; +defm : CMOVmr<X86_COND_E , CMOVNE16rm, CMOVNE32rm, CMOVNE64rm>; +defm : CMOVmr<X86_COND_NE, CMOVE16rm , CMOVE32rm , CMOVE64rm>; +defm : CMOVmr<X86_COND_BE, CMOVA16rm , CMOVA32rm , CMOVA64rm>; +defm : CMOVmr<X86_COND_A , CMOVBE16rm, CMOVBE32rm, CMOVBE64rm>; +defm : CMOVmr<X86_COND_L , CMOVGE16rm, CMOVGE32rm, CMOVGE64rm>; +defm : CMOVmr<X86_COND_GE, CMOVL16rm , CMOVL32rm , CMOVL64rm>; +defm : CMOVmr<X86_COND_LE, CMOVG16rm , CMOVG32rm , CMOVG64rm>; +defm : CMOVmr<X86_COND_G , CMOVLE16rm, CMOVLE32rm, CMOVLE64rm>; +defm : CMOVmr<X86_COND_P , CMOVNP16rm, CMOVNP32rm, CMOVNP64rm>; +defm : CMOVmr<X86_COND_NP, CMOVP16rm , CMOVP32rm , CMOVP64rm>; +defm : CMOVmr<X86_COND_S , CMOVNS16rm, CMOVNS32rm, CMOVNS64rm>; +defm : CMOVmr<X86_COND_NS, CMOVS16rm , CMOVS32rm , CMOVS64rm>; +defm : CMOVmr<X86_COND_O , CMOVNO16rm, CMOVNO32rm, CMOVNO64rm>; +defm : CMOVmr<X86_COND_NO, CMOVO16rm , CMOVO32rm , CMOVO64rm>; + +// zextload bool -> zextload byte +def : Pat<(zextloadi8i1 addr:$src), (MOV8rm addr:$src)>; +def : Pat<(zextloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>; +def : Pat<(zextloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>; +def : Pat<(zextloadi64i1 addr:$src), (MOVZX64rm8 addr:$src)>; + +// extload bool -> extload byte +// When extloading from 16-bit and smaller memory locations into 64-bit +// registers, use zero-extending loads so that the entire 64-bit register is +// defined, avoiding partial-register updates. + +def : Pat<(extloadi8i1 addr:$src), (MOV8rm addr:$src)>; +def : Pat<(extloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>; +def : Pat<(extloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>; +def : Pat<(extloadi16i8 addr:$src), (MOVZX16rm8 addr:$src)>; +def : Pat<(extloadi32i8 addr:$src), (MOVZX32rm8 addr:$src)>; +def : Pat<(extloadi32i16 addr:$src), (MOVZX32rm16 addr:$src)>; + +def : Pat<(extloadi64i1 addr:$src), (MOVZX64rm8 addr:$src)>; +def : Pat<(extloadi64i8 addr:$src), (MOVZX64rm8 addr:$src)>; +def : Pat<(extloadi64i16 addr:$src), (MOVZX64rm16 addr:$src)>; +// For other extloads, use subregs, since the high contents of the register are +// defined after an extload. +def : Pat<(extloadi64i32 addr:$src), + (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src), + sub_32bit)>; + +// anyext. Define these to do an explicit zero-extend to +// avoid partial-register updates. +def : Pat<(i16 (anyext GR8 :$src)), (EXTRACT_SUBREG + (MOVZX32rr8 GR8 :$src), sub_16bit)>; +def : Pat<(i32 (anyext GR8 :$src)), (MOVZX32rr8 GR8 :$src)>; + +// Except for i16 -> i32 since isel expect i16 ops to be promoted to i32. +def : Pat<(i32 (anyext GR16:$src)), + (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR16:$src, sub_16bit)>; + +def : Pat<(i64 (anyext GR8 :$src)), (MOVZX64rr8 GR8 :$src)>; +def : Pat<(i64 (anyext GR16:$src)), (MOVZX64rr16 GR16 :$src)>; +def : Pat<(i64 (anyext GR32:$src)), + (SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>; + + +// Any instruction that defines a 32-bit result leaves the high half of the +// register. Truncate can be lowered to EXTRACT_SUBREG. 
CopyFromReg may +// be copying from a truncate. And x86's cmov doesn't do anything if the +// condition is false. But any other 32-bit operation will zero-extend +// up to 64 bits. +def def32 : PatLeaf<(i32 GR32:$src), [{ + return N->getOpcode() != ISD::TRUNCATE && + N->getOpcode() != TargetOpcode::EXTRACT_SUBREG && + N->getOpcode() != ISD::CopyFromReg && + N->getOpcode() != X86ISD::CMOV; +}]>; + +// In the case of a 32-bit def that is known to implicitly zero-extend, +// we can use a SUBREG_TO_REG. +def : Pat<(i64 (zext def32:$src)), + (SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>; + +//===----------------------------------------------------------------------===// +// Pattern match OR as ADD +//===----------------------------------------------------------------------===// + +// If safe, we prefer to pattern match OR as ADD at isel time. ADD can be +// 3-addressified into an LEA instruction to avoid copies. However, we also +// want to finally emit these instructions as an or at the end of the code +// generator to make the generated code easier to read. To do this, we select +// into "disjoint bits" pseudo ops. + +// Treat an 'or' node is as an 'add' if the or'ed bits are known to be zero. +def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{ + if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1))) + return CurDAG->MaskedValueIsZero(N->getOperand(0), CN->getAPIntValue()); + + unsigned BitWidth = N->getValueType(0).getScalarType().getSizeInBits(); + APInt Mask = APInt::getAllOnesValue(BitWidth); + APInt KnownZero0, KnownOne0; + CurDAG->ComputeMaskedBits(N->getOperand(0), Mask, KnownZero0, KnownOne0, 0); + APInt KnownZero1, KnownOne1; + CurDAG->ComputeMaskedBits(N->getOperand(1), Mask, KnownZero1, KnownOne1, 0); + return (~KnownZero0 & ~KnownZero1) == 0; +}]>; + + +// (or x1, x2) -> (add x1, x2) if two operands are known not to share bits. +let AddedComplexity = 5 in { // Try this before the selecting to OR + +let isConvertibleToThreeAddress = 1, + Constraints = "$src1 = $dst", Defs = [EFLAGS] in { +let isCommutable = 1 in { +def ADD16rr_DB : I<0, Pseudo, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), + "", // orw/addw REG, REG + [(set GR16:$dst, (or_is_add GR16:$src1, GR16:$src2))]>; +def ADD32rr_DB : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), + "", // orl/addl REG, REG + [(set GR32:$dst, (or_is_add GR32:$src1, GR32:$src2))]>; +def ADD64rr_DB : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), + "", // orq/addq REG, REG + [(set GR64:$dst, (or_is_add GR64:$src1, GR64:$src2))]>; +} // isCommutable + +// NOTE: These are order specific, we want the ri8 forms to be listed +// first so that they are slightly preferred to the ri forms. 
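The immediate forms this note refers to follow right below. The fact or_is_add checks for is easy to state: when the operands have no set bits in common there are no carries, so or and add produce the same value, and the ADD*_DB pseudos can later be three-addressified into LEA. For example:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t base = 0xABCD0000u;   // low 16 bits known to be zero
  uint32_t low  = 0x00001234u;   // high 16 bits known to be zero
  assert((base & low) == 0);               // disjoint bits...
  assert((base | low) == (base + low));    // ...so or and add agree
  return 0;
}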
+ +def ADD16ri8_DB : I<0, Pseudo, + (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), + "", // orw/addw REG, imm8 + [(set GR16:$dst,(or_is_add GR16:$src1,i16immSExt8:$src2))]>; +def ADD16ri_DB : I<0, Pseudo, (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2), + "", // orw/addw REG, imm + [(set GR16:$dst, (or_is_add GR16:$src1, imm:$src2))]>; + +def ADD32ri8_DB : I<0, Pseudo, + (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), + "", // orl/addl REG, imm8 + [(set GR32:$dst,(or_is_add GR32:$src1,i32immSExt8:$src2))]>; +def ADD32ri_DB : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2), + "", // orl/addl REG, imm + [(set GR32:$dst, (or_is_add GR32:$src1, imm:$src2))]>; + + +def ADD64ri8_DB : I<0, Pseudo, + (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2), + "", // orq/addq REG, imm8 + [(set GR64:$dst, (or_is_add GR64:$src1, + i64immSExt8:$src2))]>; +def ADD64ri32_DB : I<0, Pseudo, + (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2), + "", // orq/addq REG, imm + [(set GR64:$dst, (or_is_add GR64:$src1, + i64immSExt32:$src2))]>; +} +} // AddedComplexity + + +//===----------------------------------------------------------------------===// +// Some peepholes +//===----------------------------------------------------------------------===// + +// Odd encoding trick: -128 fits into an 8-bit immediate field while +// +128 doesn't, so in this special case use a sub instead of an add. +def : Pat<(add GR16:$src1, 128), + (SUB16ri8 GR16:$src1, -128)>; +def : Pat<(store (add (loadi16 addr:$dst), 128), addr:$dst), + (SUB16mi8 addr:$dst, -128)>; + +def : Pat<(add GR32:$src1, 128), + (SUB32ri8 GR32:$src1, -128)>; +def : Pat<(store (add (loadi32 addr:$dst), 128), addr:$dst), + (SUB32mi8 addr:$dst, -128)>; + +def : Pat<(add GR64:$src1, 128), + (SUB64ri8 GR64:$src1, -128)>; +def : Pat<(store (add (loadi64 addr:$dst), 128), addr:$dst), + (SUB64mi8 addr:$dst, -128)>; + +// The same trick applies for 32-bit immediate fields in 64-bit +// instructions. +def : Pat<(add GR64:$src1, 0x0000000080000000), + (SUB64ri32 GR64:$src1, 0xffffffff80000000)>; +def : Pat<(store (add (loadi64 addr:$dst), 0x00000000800000000), addr:$dst), + (SUB64mi32 addr:$dst, 0xffffffff80000000)>; + +// To avoid needing to materialize an immediate in a register, use a 32-bit and +// with implicit zero-extension instead of a 64-bit and if the immediate has at +// least 32 bits of leading zeros. If in addition the last 32 bits can be +// represented with a sign extension of a 8 bit constant, use that. 
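The two AND patterns that follow implement that narrowing; they are sound because every 32-bit operation already zeroes bits 63:32 of the destination (the def32 property above), so when the mask has at least 32 leading zero bits nothing is lost by and'ing only the low halves. A small check of the identity:

#include <cassert>
#include <cstdint>

int main() {
  uint64_t value = 0x1122334455667788ull;
  uint64_t mask  = 0x00000000000000F0ull;              // upper 32 bits all zero
  uint64_t via_32bit_and =
      static_cast<uint32_t>(value) & static_cast<uint32_t>(mask);
  assert((value & mask) == via_32bit_and);             // the 32-bit and suffices
  return 0;
}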
+ +def : Pat<(and GR64:$src, i64immZExt32SExt8:$imm), + (SUBREG_TO_REG + (i64 0), + (AND32ri8 + (EXTRACT_SUBREG GR64:$src, sub_32bit), + (i32 (GetLo8XForm imm:$imm))), + sub_32bit)>; + +def : Pat<(and GR64:$src, i64immZExt32:$imm), + (SUBREG_TO_REG + (i64 0), + (AND32ri + (EXTRACT_SUBREG GR64:$src, sub_32bit), + (i32 (GetLo32XForm imm:$imm))), + sub_32bit)>; + + +// r & (2^16-1) ==> movz +def : Pat<(and GR32:$src1, 0xffff), + (MOVZX32rr16 (EXTRACT_SUBREG GR32:$src1, sub_16bit))>; +// r & (2^8-1) ==> movz +def : Pat<(and GR32:$src1, 0xff), + (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src1, + GR32_ABCD)), + sub_8bit))>, + Requires<[In32BitMode]>; +// r & (2^8-1) ==> movz +def : Pat<(and GR16:$src1, 0xff), + (EXTRACT_SUBREG (MOVZX32rr8 (EXTRACT_SUBREG + (i16 (COPY_TO_REGCLASS GR16:$src1, GR16_ABCD)), sub_8bit)), + sub_16bit)>, + Requires<[In32BitMode]>; + +// r & (2^32-1) ==> movz +def : Pat<(and GR64:$src, 0x00000000FFFFFFFF), + (MOVZX64rr32 (EXTRACT_SUBREG GR64:$src, sub_32bit))>; +// r & (2^16-1) ==> movz +def : Pat<(and GR64:$src, 0xffff), + (MOVZX64rr16 (i16 (EXTRACT_SUBREG GR64:$src, sub_16bit)))>; +// r & (2^8-1) ==> movz +def : Pat<(and GR64:$src, 0xff), + (MOVZX64rr8 (i8 (EXTRACT_SUBREG GR64:$src, sub_8bit)))>; +// r & (2^8-1) ==> movz +def : Pat<(and GR32:$src1, 0xff), + (MOVZX32rr8 (EXTRACT_SUBREG GR32:$src1, sub_8bit))>, + Requires<[In64BitMode]>; +// r & (2^8-1) ==> movz +def : Pat<(and GR16:$src1, 0xff), + (EXTRACT_SUBREG (MOVZX32rr8 (i8 + (EXTRACT_SUBREG GR16:$src1, sub_8bit))), sub_16bit)>, + Requires<[In64BitMode]>; + + +// sext_inreg patterns +def : Pat<(sext_inreg GR32:$src, i16), + (MOVSX32rr16 (EXTRACT_SUBREG GR32:$src, sub_16bit))>; +def : Pat<(sext_inreg GR32:$src, i8), + (MOVSX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, + GR32_ABCD)), + sub_8bit))>, + Requires<[In32BitMode]>; + +def : Pat<(sext_inreg GR16:$src, i8), + (EXTRACT_SUBREG (i32 (MOVSX32rr8 (EXTRACT_SUBREG + (i32 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), sub_8bit))), + sub_16bit)>, + Requires<[In32BitMode]>; + +def : Pat<(sext_inreg GR64:$src, i32), + (MOVSX64rr32 (EXTRACT_SUBREG GR64:$src, sub_32bit))>; +def : Pat<(sext_inreg GR64:$src, i16), + (MOVSX64rr16 (EXTRACT_SUBREG GR64:$src, sub_16bit))>; +def : Pat<(sext_inreg GR64:$src, i8), + (MOVSX64rr8 (EXTRACT_SUBREG GR64:$src, sub_8bit))>; +def : Pat<(sext_inreg GR32:$src, i8), + (MOVSX32rr8 (EXTRACT_SUBREG GR32:$src, sub_8bit))>, + Requires<[In64BitMode]>; +def : Pat<(sext_inreg GR16:$src, i8), + (EXTRACT_SUBREG (MOVSX32rr8 + (EXTRACT_SUBREG GR16:$src, sub_8bit)), sub_16bit)>, + Requires<[In64BitMode]>; + +// sext, sext_load, zext, zext_load +def: Pat<(i16 (sext GR8:$src)), + (EXTRACT_SUBREG (MOVSX32rr8 GR8:$src), sub_16bit)>; +def: Pat<(sextloadi16i8 addr:$src), + (EXTRACT_SUBREG (MOVSX32rm8 addr:$src), sub_16bit)>; +def: Pat<(i16 (zext GR8:$src)), + (EXTRACT_SUBREG (MOVZX32rr8 GR8:$src), sub_16bit)>; +def: Pat<(zextloadi16i8 addr:$src), + (EXTRACT_SUBREG (MOVZX32rm8 addr:$src), sub_16bit)>; + +// trunc patterns +def : Pat<(i16 (trunc GR32:$src)), + (EXTRACT_SUBREG GR32:$src, sub_16bit)>; +def : Pat<(i8 (trunc GR32:$src)), + (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)), + sub_8bit)>, + Requires<[In32BitMode]>; +def : Pat<(i8 (trunc GR16:$src)), + (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), + sub_8bit)>, + Requires<[In32BitMode]>; +def : Pat<(i32 (trunc GR64:$src)), + (EXTRACT_SUBREG GR64:$src, sub_32bit)>; +def : Pat<(i16 (trunc GR64:$src)), + (EXTRACT_SUBREG GR64:$src, sub_16bit)>; +def : 
Pat<(i8 (trunc GR64:$src)), + (EXTRACT_SUBREG GR64:$src, sub_8bit)>; +def : Pat<(i8 (trunc GR32:$src)), + (EXTRACT_SUBREG GR32:$src, sub_8bit)>, + Requires<[In64BitMode]>; +def : Pat<(i8 (trunc GR16:$src)), + (EXTRACT_SUBREG GR16:$src, sub_8bit)>, + Requires<[In64BitMode]>; + +// h-register tricks +def : Pat<(i8 (trunc (srl_su GR16:$src, (i8 8)))), + (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), + sub_8bit_hi)>, + Requires<[In32BitMode]>; +def : Pat<(i8 (trunc (srl_su GR32:$src, (i8 8)))), + (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)), + sub_8bit_hi)>, + Requires<[In32BitMode]>; +def : Pat<(srl GR16:$src, (i8 8)), + (EXTRACT_SUBREG + (MOVZX32rr8 + (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), + sub_8bit_hi)), + sub_16bit)>, + Requires<[In32BitMode]>; +def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))), + (MOVZX32rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, + GR16_ABCD)), + sub_8bit_hi))>, + Requires<[In32BitMode]>; +def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))), + (MOVZX32rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, + GR16_ABCD)), + sub_8bit_hi))>, + Requires<[In32BitMode]>; +def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)), + (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, + GR32_ABCD)), + sub_8bit_hi))>, + Requires<[In32BitMode]>; +def : Pat<(srl (and_su GR32:$src, 0xff00), (i8 8)), + (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, + GR32_ABCD)), + sub_8bit_hi))>, + Requires<[In32BitMode]>; + +// h-register tricks. +// For now, be conservative on x86-64 and use an h-register extract only if the +// value is immediately zero-extended or stored, which are somewhat common +// cases. This uses a bunch of code to prevent a register requiring a REX prefix +// from being allocated in the same instruction as the h register, as there's +// currently no way to describe this requirement to the register allocator. + +// h-register extract and zero-extend. +def : Pat<(and (srl_su GR64:$src, (i8 8)), (i64 255)), + (SUBREG_TO_REG + (i64 0), + (MOVZX32_NOREXrr8 + (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS GR64:$src, GR64_ABCD)), + sub_8bit_hi)), + sub_32bit)>; +def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)), + (MOVZX32_NOREXrr8 + (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)), + sub_8bit_hi))>, + Requires<[In64BitMode]>; +def : Pat<(srl (and_su GR32:$src, 0xff00), (i8 8)), + (MOVZX32_NOREXrr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, + GR32_ABCD)), + sub_8bit_hi))>, + Requires<[In64BitMode]>; +def : Pat<(srl GR16:$src, (i8 8)), + (EXTRACT_SUBREG + (MOVZX32_NOREXrr8 + (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), + sub_8bit_hi)), + sub_16bit)>, + Requires<[In64BitMode]>; +def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))), + (MOVZX32_NOREXrr8 + (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), + sub_8bit_hi))>, + Requires<[In64BitMode]>; +def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))), + (MOVZX32_NOREXrr8 + (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), + sub_8bit_hi))>, + Requires<[In64BitMode]>; +def : Pat<(i64 (zext (srl_su GR16:$src, (i8 8)))), + (SUBREG_TO_REG + (i64 0), + (MOVZX32_NOREXrr8 + (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), + sub_8bit_hi)), + sub_32bit)>; +def : Pat<(i64 (anyext (srl_su GR16:$src, (i8 8)))), + (SUBREG_TO_REG + (i64 0), + (MOVZX32_NOREXrr8 + (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), + sub_8bit_hi)), + sub_32bit)>; + +// h-register extract and store. 
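The store patterns for this case follow. What all of these h-register patterns recognise is the shape "(value >> 8) & 0xff" (or a truncate of "value >> 8") on a 16- or 32-bit quantity, which is exactly the high byte of the low word and can be read directly as AH/BH/CH/DH instead of shifting; hence the COPY_TO_REGCLASS into the ABCD classes and, on x86-64, the _NOREX variants. For instance:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t value = 0x00003A7Fu;
  uint8_t high_byte = static_cast<uint8_t>((value >> 8) & 0xFFu);
  assert(high_byte == 0x3A);   // the byte an AH-style extract would read
  return 0;
}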
+def : Pat<(store (i8 (trunc_su (srl_su GR64:$src, (i8 8)))), addr:$dst), + (MOV8mr_NOREX + addr:$dst, + (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS GR64:$src, GR64_ABCD)), + sub_8bit_hi))>; +def : Pat<(store (i8 (trunc_su (srl_su GR32:$src, (i8 8)))), addr:$dst), + (MOV8mr_NOREX + addr:$dst, + (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)), + sub_8bit_hi))>, + Requires<[In64BitMode]>; +def : Pat<(store (i8 (trunc_su (srl_su GR16:$src, (i8 8)))), addr:$dst), + (MOV8mr_NOREX + addr:$dst, + (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), + sub_8bit_hi))>, + Requires<[In64BitMode]>; + + +// (shl x, 1) ==> (add x, x) +// Note that if x is undef (immediate or otherwise), we could theoretically +// end up with the two uses of x getting different values, producing a result +// where the least significant bit is not 0. However, the probability of this +// happening is considered low enough that this is officially not a +// "real problem". +def : Pat<(shl GR8 :$src1, (i8 1)), (ADD8rr GR8 :$src1, GR8 :$src1)>; +def : Pat<(shl GR16:$src1, (i8 1)), (ADD16rr GR16:$src1, GR16:$src1)>; +def : Pat<(shl GR32:$src1, (i8 1)), (ADD32rr GR32:$src1, GR32:$src1)>; +def : Pat<(shl GR64:$src1, (i8 1)), (ADD64rr GR64:$src1, GR64:$src1)>; + +// (shl x (and y, 31)) ==> (shl x, y) +def : Pat<(shl GR8:$src1, (and CL, 31)), + (SHL8rCL GR8:$src1)>; +def : Pat<(shl GR16:$src1, (and CL, 31)), + (SHL16rCL GR16:$src1)>; +def : Pat<(shl GR32:$src1, (and CL, 31)), + (SHL32rCL GR32:$src1)>; +def : Pat<(store (shl (loadi8 addr:$dst), (and CL, 31)), addr:$dst), + (SHL8mCL addr:$dst)>; +def : Pat<(store (shl (loadi16 addr:$dst), (and CL, 31)), addr:$dst), + (SHL16mCL addr:$dst)>; +def : Pat<(store (shl (loadi32 addr:$dst), (and CL, 31)), addr:$dst), + (SHL32mCL addr:$dst)>; + +def : Pat<(srl GR8:$src1, (and CL, 31)), + (SHR8rCL GR8:$src1)>; +def : Pat<(srl GR16:$src1, (and CL, 31)), + (SHR16rCL GR16:$src1)>; +def : Pat<(srl GR32:$src1, (and CL, 31)), + (SHR32rCL GR32:$src1)>; +def : Pat<(store (srl (loadi8 addr:$dst), (and CL, 31)), addr:$dst), + (SHR8mCL addr:$dst)>; +def : Pat<(store (srl (loadi16 addr:$dst), (and CL, 31)), addr:$dst), + (SHR16mCL addr:$dst)>; +def : Pat<(store (srl (loadi32 addr:$dst), (and CL, 31)), addr:$dst), + (SHR32mCL addr:$dst)>; + +def : Pat<(sra GR8:$src1, (and CL, 31)), + (SAR8rCL GR8:$src1)>; +def : Pat<(sra GR16:$src1, (and CL, 31)), + (SAR16rCL GR16:$src1)>; +def : Pat<(sra GR32:$src1, (and CL, 31)), + (SAR32rCL GR32:$src1)>; +def : Pat<(store (sra (loadi8 addr:$dst), (and CL, 31)), addr:$dst), + (SAR8mCL addr:$dst)>; +def : Pat<(store (sra (loadi16 addr:$dst), (and CL, 31)), addr:$dst), + (SAR16mCL addr:$dst)>; +def : Pat<(store (sra (loadi32 addr:$dst), (and CL, 31)), addr:$dst), + (SAR32mCL addr:$dst)>; + +// (shl x (and y, 63)) ==> (shl x, y) +def : Pat<(shl GR64:$src1, (and CL, 63)), + (SHL64rCL GR64:$src1)>; +def : Pat<(store (shl (loadi64 addr:$dst), (and CL, 63)), addr:$dst), + (SHL64mCL addr:$dst)>; + +def : Pat<(srl GR64:$src1, (and CL, 63)), + (SHR64rCL GR64:$src1)>; +def : Pat<(store (srl (loadi64 addr:$dst), (and CL, 63)), addr:$dst), + (SHR64mCL addr:$dst)>; + +def : Pat<(sra GR64:$src1, (and CL, 63)), + (SAR64rCL GR64:$src1)>; +def : Pat<(store (sra (loadi64 addr:$dst), (and CL, 63)), addr:$dst), + (SAR64mCL addr:$dst)>; + + +// (anyext (setcc_carry)) -> (setcc_carry) +def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), + (SETB_C16r)>; +def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), + (SETB_C32r)>; +def : Pat<(i32 (anyext (i16 
(X86setcc_c X86_COND_B, EFLAGS)))), + (SETB_C32r)>; + + + + +//===----------------------------------------------------------------------===// +// EFLAGS-defining Patterns +//===----------------------------------------------------------------------===// + +// add reg, reg +def : Pat<(add GR8 :$src1, GR8 :$src2), (ADD8rr GR8 :$src1, GR8 :$src2)>; +def : Pat<(add GR16:$src1, GR16:$src2), (ADD16rr GR16:$src1, GR16:$src2)>; +def : Pat<(add GR32:$src1, GR32:$src2), (ADD32rr GR32:$src1, GR32:$src2)>; + +// add reg, mem +def : Pat<(add GR8:$src1, (loadi8 addr:$src2)), + (ADD8rm GR8:$src1, addr:$src2)>; +def : Pat<(add GR16:$src1, (loadi16 addr:$src2)), + (ADD16rm GR16:$src1, addr:$src2)>; +def : Pat<(add GR32:$src1, (loadi32 addr:$src2)), + (ADD32rm GR32:$src1, addr:$src2)>; + +// add reg, imm +def : Pat<(add GR8 :$src1, imm:$src2), (ADD8ri GR8:$src1 , imm:$src2)>; +def : Pat<(add GR16:$src1, imm:$src2), (ADD16ri GR16:$src1, imm:$src2)>; +def : Pat<(add GR32:$src1, imm:$src2), (ADD32ri GR32:$src1, imm:$src2)>; +def : Pat<(add GR16:$src1, i16immSExt8:$src2), + (ADD16ri8 GR16:$src1, i16immSExt8:$src2)>; +def : Pat<(add GR32:$src1, i32immSExt8:$src2), + (ADD32ri8 GR32:$src1, i32immSExt8:$src2)>; + +// sub reg, reg +def : Pat<(sub GR8 :$src1, GR8 :$src2), (SUB8rr GR8 :$src1, GR8 :$src2)>; +def : Pat<(sub GR16:$src1, GR16:$src2), (SUB16rr GR16:$src1, GR16:$src2)>; +def : Pat<(sub GR32:$src1, GR32:$src2), (SUB32rr GR32:$src1, GR32:$src2)>; + +// sub reg, mem +def : Pat<(sub GR8:$src1, (loadi8 addr:$src2)), + (SUB8rm GR8:$src1, addr:$src2)>; +def : Pat<(sub GR16:$src1, (loadi16 addr:$src2)), + (SUB16rm GR16:$src1, addr:$src2)>; +def : Pat<(sub GR32:$src1, (loadi32 addr:$src2)), + (SUB32rm GR32:$src1, addr:$src2)>; + +// sub reg, imm +def : Pat<(sub GR8:$src1, imm:$src2), + (SUB8ri GR8:$src1, imm:$src2)>; +def : Pat<(sub GR16:$src1, imm:$src2), + (SUB16ri GR16:$src1, imm:$src2)>; +def : Pat<(sub GR32:$src1, imm:$src2), + (SUB32ri GR32:$src1, imm:$src2)>; +def : Pat<(sub GR16:$src1, i16immSExt8:$src2), + (SUB16ri8 GR16:$src1, i16immSExt8:$src2)>; +def : Pat<(sub GR32:$src1, i32immSExt8:$src2), + (SUB32ri8 GR32:$src1, i32immSExt8:$src2)>; + +// mul reg, reg +def : Pat<(mul GR16:$src1, GR16:$src2), + (IMUL16rr GR16:$src1, GR16:$src2)>; +def : Pat<(mul GR32:$src1, GR32:$src2), + (IMUL32rr GR32:$src1, GR32:$src2)>; + +// mul reg, mem +def : Pat<(mul GR16:$src1, (loadi16 addr:$src2)), + (IMUL16rm GR16:$src1, addr:$src2)>; +def : Pat<(mul GR32:$src1, (loadi32 addr:$src2)), + (IMUL32rm GR32:$src1, addr:$src2)>; + +// mul reg, imm +def : Pat<(mul GR16:$src1, imm:$src2), + (IMUL16rri GR16:$src1, imm:$src2)>; +def : Pat<(mul GR32:$src1, imm:$src2), + (IMUL32rri GR32:$src1, imm:$src2)>; +def : Pat<(mul GR16:$src1, i16immSExt8:$src2), + (IMUL16rri8 GR16:$src1, i16immSExt8:$src2)>; +def : Pat<(mul GR32:$src1, i32immSExt8:$src2), + (IMUL32rri8 GR32:$src1, i32immSExt8:$src2)>; + +// reg = mul mem, imm +def : Pat<(mul (loadi16 addr:$src1), imm:$src2), + (IMUL16rmi addr:$src1, imm:$src2)>; +def : Pat<(mul (loadi32 addr:$src1), imm:$src2), + (IMUL32rmi addr:$src1, imm:$src2)>; +def : Pat<(mul (loadi16 addr:$src1), i16immSExt8:$src2), + (IMUL16rmi8 addr:$src1, i16immSExt8:$src2)>; +def : Pat<(mul (loadi32 addr:$src1), i32immSExt8:$src2), + (IMUL32rmi8 addr:$src1, i32immSExt8:$src2)>; + +// Patterns for nodes that do not produce flags, for instructions that do. 
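Plain ISD::ADD, SUB, and MUL carry no flags result, but the scalar instructions available here always write EFLAGS; the patterns above and the 64-bit ones below therefore map the flag-less node onto the flag-defining instruction and simply leave the EFLAGS output unused. For example:

#include <cstdint>

// Nothing here reads the condition flags, yet the subtraction selects a sub
// instruction that defines EFLAGS; that def is simply dead.
uint64_t diff(uint64_t a, uint64_t b) {
  return a - b;
}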
+ +// addition +def : Pat<(add GR64:$src1, GR64:$src2), + (ADD64rr GR64:$src1, GR64:$src2)>; +def : Pat<(add GR64:$src1, i64immSExt8:$src2), + (ADD64ri8 GR64:$src1, i64immSExt8:$src2)>; +def : Pat<(add GR64:$src1, i64immSExt32:$src2), + (ADD64ri32 GR64:$src1, i64immSExt32:$src2)>; +def : Pat<(add GR64:$src1, (loadi64 addr:$src2)), + (ADD64rm GR64:$src1, addr:$src2)>; + +// subtraction +def : Pat<(sub GR64:$src1, GR64:$src2), + (SUB64rr GR64:$src1, GR64:$src2)>; +def : Pat<(sub GR64:$src1, (loadi64 addr:$src2)), + (SUB64rm GR64:$src1, addr:$src2)>; +def : Pat<(sub GR64:$src1, i64immSExt8:$src2), + (SUB64ri8 GR64:$src1, i64immSExt8:$src2)>; +def : Pat<(sub GR64:$src1, i64immSExt32:$src2), + (SUB64ri32 GR64:$src1, i64immSExt32:$src2)>; + +// Multiply +def : Pat<(mul GR64:$src1, GR64:$src2), + (IMUL64rr GR64:$src1, GR64:$src2)>; +def : Pat<(mul GR64:$src1, (loadi64 addr:$src2)), + (IMUL64rm GR64:$src1, addr:$src2)>; +def : Pat<(mul GR64:$src1, i64immSExt8:$src2), + (IMUL64rri8 GR64:$src1, i64immSExt8:$src2)>; +def : Pat<(mul GR64:$src1, i64immSExt32:$src2), + (IMUL64rri32 GR64:$src1, i64immSExt32:$src2)>; +def : Pat<(mul (loadi64 addr:$src1), i64immSExt8:$src2), + (IMUL64rmi8 addr:$src1, i64immSExt8:$src2)>; +def : Pat<(mul (loadi64 addr:$src1), i64immSExt32:$src2), + (IMUL64rmi32 addr:$src1, i64immSExt32:$src2)>; + +// Increment reg. +def : Pat<(add GR8 :$src, 1), (INC8r GR8 :$src)>; +def : Pat<(add GR16:$src, 1), (INC16r GR16:$src)>, Requires<[In32BitMode]>; +def : Pat<(add GR16:$src, 1), (INC64_16r GR16:$src)>, Requires<[In64BitMode]>; +def : Pat<(add GR32:$src, 1), (INC32r GR32:$src)>, Requires<[In32BitMode]>; +def : Pat<(add GR32:$src, 1), (INC64_32r GR32:$src)>, Requires<[In64BitMode]>; +def : Pat<(add GR64:$src, 1), (INC64r GR64:$src)>; + +// Decrement reg. +def : Pat<(add GR8 :$src, -1), (DEC8r GR8 :$src)>; +def : Pat<(add GR16:$src, -1), (DEC16r GR16:$src)>, Requires<[In32BitMode]>; +def : Pat<(add GR16:$src, -1), (DEC64_16r GR16:$src)>, Requires<[In64BitMode]>; +def : Pat<(add GR32:$src, -1), (DEC32r GR32:$src)>, Requires<[In32BitMode]>; +def : Pat<(add GR32:$src, -1), (DEC64_32r GR32:$src)>, Requires<[In64BitMode]>; +def : Pat<(add GR64:$src, -1), (DEC64r GR64:$src)>; + +// or reg/reg. 
+def : Pat<(or GR8 :$src1, GR8 :$src2), (OR8rr GR8 :$src1, GR8 :$src2)>; +def : Pat<(or GR16:$src1, GR16:$src2), (OR16rr GR16:$src1, GR16:$src2)>; +def : Pat<(or GR32:$src1, GR32:$src2), (OR32rr GR32:$src1, GR32:$src2)>; +def : Pat<(or GR64:$src1, GR64:$src2), (OR64rr GR64:$src1, GR64:$src2)>; + +// or reg/mem +def : Pat<(or GR8:$src1, (loadi8 addr:$src2)), + (OR8rm GR8:$src1, addr:$src2)>; +def : Pat<(or GR16:$src1, (loadi16 addr:$src2)), + (OR16rm GR16:$src1, addr:$src2)>; +def : Pat<(or GR32:$src1, (loadi32 addr:$src2)), + (OR32rm GR32:$src1, addr:$src2)>; +def : Pat<(or GR64:$src1, (loadi64 addr:$src2)), + (OR64rm GR64:$src1, addr:$src2)>; + +// or reg/imm +def : Pat<(or GR8:$src1 , imm:$src2), (OR8ri GR8 :$src1, imm:$src2)>; +def : Pat<(or GR16:$src1, imm:$src2), (OR16ri GR16:$src1, imm:$src2)>; +def : Pat<(or GR32:$src1, imm:$src2), (OR32ri GR32:$src1, imm:$src2)>; +def : Pat<(or GR16:$src1, i16immSExt8:$src2), + (OR16ri8 GR16:$src1, i16immSExt8:$src2)>; +def : Pat<(or GR32:$src1, i32immSExt8:$src2), + (OR32ri8 GR32:$src1, i32immSExt8:$src2)>; +def : Pat<(or GR64:$src1, i64immSExt8:$src2), + (OR64ri8 GR64:$src1, i64immSExt8:$src2)>; +def : Pat<(or GR64:$src1, i64immSExt32:$src2), + (OR64ri32 GR64:$src1, i64immSExt32:$src2)>; + +// xor reg/reg +def : Pat<(xor GR8 :$src1, GR8 :$src2), (XOR8rr GR8 :$src1, GR8 :$src2)>; +def : Pat<(xor GR16:$src1, GR16:$src2), (XOR16rr GR16:$src1, GR16:$src2)>; +def : Pat<(xor GR32:$src1, GR32:$src2), (XOR32rr GR32:$src1, GR32:$src2)>; +def : Pat<(xor GR64:$src1, GR64:$src2), (XOR64rr GR64:$src1, GR64:$src2)>; + +// xor reg/mem +def : Pat<(xor GR8:$src1, (loadi8 addr:$src2)), + (XOR8rm GR8:$src1, addr:$src2)>; +def : Pat<(xor GR16:$src1, (loadi16 addr:$src2)), + (XOR16rm GR16:$src1, addr:$src2)>; +def : Pat<(xor GR32:$src1, (loadi32 addr:$src2)), + (XOR32rm GR32:$src1, addr:$src2)>; +def : Pat<(xor GR64:$src1, (loadi64 addr:$src2)), + (XOR64rm GR64:$src1, addr:$src2)>; + +// xor reg/imm +def : Pat<(xor GR8:$src1, imm:$src2), + (XOR8ri GR8:$src1, imm:$src2)>; +def : Pat<(xor GR16:$src1, imm:$src2), + (XOR16ri GR16:$src1, imm:$src2)>; +def : Pat<(xor GR32:$src1, imm:$src2), + (XOR32ri GR32:$src1, imm:$src2)>; +def : Pat<(xor GR16:$src1, i16immSExt8:$src2), + (XOR16ri8 GR16:$src1, i16immSExt8:$src2)>; +def : Pat<(xor GR32:$src1, i32immSExt8:$src2), + (XOR32ri8 GR32:$src1, i32immSExt8:$src2)>; +def : Pat<(xor GR64:$src1, i64immSExt8:$src2), + (XOR64ri8 GR64:$src1, i64immSExt8:$src2)>; +def : Pat<(xor GR64:$src1, i64immSExt32:$src2), + (XOR64ri32 GR64:$src1, i64immSExt32:$src2)>; + +// and reg/reg +def : Pat<(and GR8 :$src1, GR8 :$src2), (AND8rr GR8 :$src1, GR8 :$src2)>; +def : Pat<(and GR16:$src1, GR16:$src2), (AND16rr GR16:$src1, GR16:$src2)>; +def : Pat<(and GR32:$src1, GR32:$src2), (AND32rr GR32:$src1, GR32:$src2)>; +def : Pat<(and GR64:$src1, GR64:$src2), (AND64rr GR64:$src1, GR64:$src2)>; + +// and reg/mem +def : Pat<(and GR8:$src1, (loadi8 addr:$src2)), + (AND8rm GR8:$src1, addr:$src2)>; +def : Pat<(and GR16:$src1, (loadi16 addr:$src2)), + (AND16rm GR16:$src1, addr:$src2)>; +def : Pat<(and GR32:$src1, (loadi32 addr:$src2)), + (AND32rm GR32:$src1, addr:$src2)>; +def : Pat<(and GR64:$src1, (loadi64 addr:$src2)), + (AND64rm GR64:$src1, addr:$src2)>; + +// and reg/imm +def : Pat<(and GR8:$src1, imm:$src2), + (AND8ri GR8:$src1, imm:$src2)>; +def : Pat<(and GR16:$src1, imm:$src2), + (AND16ri GR16:$src1, imm:$src2)>; +def : Pat<(and GR32:$src1, imm:$src2), + (AND32ri GR32:$src1, imm:$src2)>; +def : Pat<(and GR16:$src1, i16immSExt8:$src2), + (AND16ri8 
GR16:$src1, i16immSExt8:$src2)>; +def : Pat<(and GR32:$src1, i32immSExt8:$src2), + (AND32ri8 GR32:$src1, i32immSExt8:$src2)>; +def : Pat<(and GR64:$src1, i64immSExt8:$src2), + (AND64ri8 GR64:$src1, i64immSExt8:$src2)>; +def : Pat<(and GR64:$src1, i64immSExt32:$src2), + (AND64ri32 GR64:$src1, i64immSExt32:$src2)>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrControl.td b/contrib/llvm/lib/Target/X86/X86InstrControl.td new file mode 100644 index 0000000..c228a0ae --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrControl.td @@ -0,0 +1,304 @@ +//===- X86InstrControl.td - Control Flow Instructions ------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the X86 jump, return, call, and related instructions. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Control Flow Instructions. +// + +// Return instructions. +let isTerminator = 1, isReturn = 1, isBarrier = 1, + hasCtrlDep = 1, FPForm = SpecialFP in { + def RET : I <0xC3, RawFrm, (outs), (ins variable_ops), + "ret", + [(X86retflag 0)]>; + def RETI : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops), + "ret\t$amt", + [(X86retflag timm:$amt)]>; + def RETIW : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops), + "retw\t$amt", + []>, OpSize; + def LRETL : I <0xCB, RawFrm, (outs), (ins), + "lretl", []>; + def LRETQ : RI <0xCB, RawFrm, (outs), (ins), + "lretq", []>; + def LRETI : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt), + "lret\t$amt", []>; + def LRETIW : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt), + "lretw\t$amt", []>, OpSize; +} + +// Unconditional branches. +let isBarrier = 1, isBranch = 1, isTerminator = 1 in { + def JMP_4 : Ii32PCRel<0xE9, RawFrm, (outs), (ins brtarget:$dst), + "jmp\t$dst", [(br bb:$dst)]>; + def JMP_1 : Ii8PCRel<0xEB, RawFrm, (outs), (ins brtarget8:$dst), + "jmp\t$dst", []>; + def JMP64pcrel32 : I<0xE9, RawFrm, (outs), (ins brtarget:$dst), + "jmp{q}\t$dst", []>; +} + +// Conditional Branches. +let isBranch = 1, isTerminator = 1, Uses = [EFLAGS] in { + multiclass ICBr<bits<8> opc1, bits<8> opc4, string asm, PatFrag Cond> { + def _1 : Ii8PCRel <opc1, RawFrm, (outs), (ins brtarget8:$dst), asm, []>; + def _4 : Ii32PCRel<opc4, RawFrm, (outs), (ins brtarget:$dst), asm, + [(X86brcond bb:$dst, Cond, EFLAGS)]>, TB; + } +} + +defm JO : ICBr<0x70, 0x80, "jo\t$dst" , X86_COND_O>; +defm JNO : ICBr<0x71, 0x81, "jno\t$dst" , X86_COND_NO>; +defm JB : ICBr<0x72, 0x82, "jb\t$dst" , X86_COND_B>; +defm JAE : ICBr<0x73, 0x83, "jae\t$dst", X86_COND_AE>; +defm JE : ICBr<0x74, 0x84, "je\t$dst" , X86_COND_E>; +defm JNE : ICBr<0x75, 0x85, "jne\t$dst", X86_COND_NE>; +defm JBE : ICBr<0x76, 0x86, "jbe\t$dst", X86_COND_BE>; +defm JA : ICBr<0x77, 0x87, "ja\t$dst" , X86_COND_A>; +defm JS : ICBr<0x78, 0x88, "js\t$dst" , X86_COND_S>; +defm JNS : ICBr<0x79, 0x89, "jns\t$dst", X86_COND_NS>; +defm JP : ICBr<0x7A, 0x8A, "jp\t$dst" , X86_COND_P>; +defm JNP : ICBr<0x7B, 0x8B, "jnp\t$dst", X86_COND_NP>; +defm JL : ICBr<0x7C, 0x8C, "jl\t$dst" , X86_COND_L>; +defm JGE : ICBr<0x7D, 0x8D, "jge\t$dst", X86_COND_GE>; +defm JLE : ICBr<0x7E, 0x8E, "jle\t$dst", X86_COND_LE>; +defm JG : ICBr<0x7F, 0x8F, "jg\t$dst" , X86_COND_G>; + +// jcx/jecx/jrcx instructions. 
+let isAsmParserOnly = 1, isBranch = 1, isTerminator = 1 in {
+  // These are the 32-bit versions of this instruction for the asmparser.  In
+  // 32-bit mode, the form with the address-size prefix is jcxz and the
+  // unprefixed form is jecxz.
+  let Uses = [CX] in
+    def JCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
+                        "jcxz\t$dst", []>, AdSize, Requires<[In32BitMode]>;
+  let Uses = [ECX] in
+    def JECXZ_32 : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
+                            "jecxz\t$dst", []>, Requires<[In32BitMode]>;
+
+  // J*CXZ instructions: the 64-bit versions for the asmparser.  In 64-bit
+  // mode, the form with the address-size prefix is jecxz and the unprefixed
+  // form is jrcxz.
+  let Uses = [ECX] in
+    def JECXZ_64 : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
+                            "jecxz\t$dst", []>, AdSize, Requires<[In64BitMode]>;
+  let Uses = [RCX] in
+    def JRCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
+                         "jrcxz\t$dst", []>, Requires<[In64BitMode]>;
+}
+
+// Indirect branches
+let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
+  def JMP32r : I<0xFF, MRM4r, (outs), (ins GR32:$dst), "jmp{l}\t{*}$dst",
+                 [(brind GR32:$dst)]>, Requires<[In32BitMode]>;
+  def JMP32m : I<0xFF, MRM4m, (outs), (ins i32mem:$dst), "jmp{l}\t{*}$dst",
+                 [(brind (loadi32 addr:$dst))]>, Requires<[In32BitMode]>;
+
+  def JMP64r : I<0xFF, MRM4r, (outs), (ins GR64:$dst), "jmp{q}\t{*}$dst",
+                 [(brind GR64:$dst)]>, Requires<[In64BitMode]>;
+  def JMP64m : I<0xFF, MRM4m, (outs), (ins i64mem:$dst), "jmp{q}\t{*}$dst",
+                 [(brind (loadi64 addr:$dst))]>, Requires<[In64BitMode]>;
+
+  def FARJMP16i : Iseg16<0xEA, RawFrmImm16, (outs),
+                         (ins i16imm:$off, i16imm:$seg),
+                         "ljmp{w}\t{$seg, $off|$off, $seg}", []>, OpSize;
+  def FARJMP32i : Iseg32<0xEA, RawFrmImm16, (outs),
+                         (ins i32imm:$off, i16imm:$seg),
+                         "ljmp{l}\t{$seg, $off|$off, $seg}", []>;
+  def FARJMP64  : RI<0xFF, MRM5m, (outs), (ins opaque80mem:$dst),
+                     "ljmp{q}\t{*}$dst", []>;
+
+  def FARJMP16m : I<0xFF, MRM5m, (outs), (ins opaque32mem:$dst),
+                    "ljmp{w}\t{*}$dst", []>, OpSize;
+  def FARJMP32m : I<0xFF, MRM5m, (outs), (ins opaque48mem:$dst),
+                    "ljmp{l}\t{*}$dst", []>;
+}
+
+
+// Loop instructions
+
+def LOOP   : Ii8PCRel<0xE2, RawFrm, (outs), (ins brtarget8:$dst), "loop\t$dst", []>;
+def LOOPE  : Ii8PCRel<0xE1, RawFrm, (outs), (ins brtarget8:$dst), "loope\t$dst", []>;
+def LOOPNE : Ii8PCRel<0xE0, RawFrm, (outs), (ins brtarget8:$dst), "loopne\t$dst", []>;
+
+//===----------------------------------------------------------------------===//
+//  Call Instructions...
+//
+let isCall = 1 in
+  // All calls clobber the non-callee saved registers. ESP is marked as
+  // a use to prevent stack-pointer assignments that appear immediately
+  // before calls from potentially appearing dead. Uses for argument
+  // registers are added manually.
+ let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, + MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, + XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, + XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS], + Uses = [ESP] in { + def CALLpcrel32 : Ii32PCRel<0xE8, RawFrm, + (outs), (ins i32imm_pcrel:$dst,variable_ops), + "call{l}\t$dst", []>, Requires<[In32BitMode]>; + def CALL32r : I<0xFF, MRM2r, (outs), (ins GR32:$dst, variable_ops), + "call{l}\t{*}$dst", [(X86call GR32:$dst)]>, + Requires<[In32BitMode]>; + def CALL32m : I<0xFF, MRM2m, (outs), (ins i32mem:$dst, variable_ops), + "call{l}\t{*}$dst", [(X86call (loadi32 addr:$dst))]>, + Requires<[In32BitMode]>; + + def FARCALL16i : Iseg16<0x9A, RawFrmImm16, (outs), + (ins i16imm:$off, i16imm:$seg), + "lcall{w}\t{$seg, $off|$off, $seg}", []>, OpSize; + def FARCALL32i : Iseg32<0x9A, RawFrmImm16, (outs), + (ins i32imm:$off, i16imm:$seg), + "lcall{l}\t{$seg, $off|$off, $seg}", []>; + + def FARCALL16m : I<0xFF, MRM3m, (outs), (ins opaque32mem:$dst), + "lcall{w}\t{*}$dst", []>, OpSize; + def FARCALL32m : I<0xFF, MRM3m, (outs), (ins opaque48mem:$dst), + "lcall{l}\t{*}$dst", []>; + + // callw for 16 bit code for the assembler. + let isAsmParserOnly = 1 in + def CALLpcrel16 : Ii16PCRel<0xE8, RawFrm, + (outs), (ins i16imm_pcrel:$dst, variable_ops), + "callw\t$dst", []>, OpSize; + } + + +// Tail call stuff. + +let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, + isCodeGenOnly = 1 in + let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, + MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, + XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, + XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS], + Uses = [ESP] in { + def TCRETURNdi : PseudoI<(outs), + (ins i32imm_pcrel:$dst, i32imm:$offset, variable_ops), []>; + def TCRETURNri : PseudoI<(outs), + (ins GR32_TC:$dst, i32imm:$offset, variable_ops), []>; + let mayLoad = 1 in + def TCRETURNmi : PseudoI<(outs), + (ins i32mem_TC:$dst, i32imm:$offset, variable_ops), []>; + + // FIXME: The should be pseudo instructions that are lowered when going to + // mcinst. + def TAILJMPd : Ii32PCRel<0xE9, RawFrm, (outs), + (ins i32imm_pcrel:$dst, variable_ops), + "jmp\t$dst # TAILCALL", + []>; + def TAILJMPr : I<0xFF, MRM4r, (outs), (ins GR32_TC:$dst, variable_ops), + "", []>; // FIXME: Remove encoding when JIT is dead. + let mayLoad = 1 in + def TAILJMPm : I<0xFF, MRM4m, (outs), (ins i32mem_TC:$dst, variable_ops), + "jmp{l}\t{*}$dst # TAILCALL", []>; +} + + +//===----------------------------------------------------------------------===// +// Call Instructions... +// +let isCall = 1 in + // All calls clobber the non-callee saved registers. RSP is marked as + // a use to prevent stack-pointer assignments that appear immediately + // before calls from potentially appearing dead. Uses for argument + // registers are added manually. + let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, + FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, ST1, + MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, + XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, + XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS], + Uses = [RSP] in { + + // NOTE: this pattern doesn't match "X86call imm", because we do not know + // that the offset between an arbitrary immediate and the call will fit in + // the 32-bit pcrel field that we have. 
+ def CALL64pcrel32 : Ii32PCRel<0xE8, RawFrm, + (outs), (ins i64i32imm_pcrel:$dst, variable_ops), + "call{q}\t$dst", []>, + Requires<[In64BitMode, NotWin64]>; + def CALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst, variable_ops), + "call{q}\t{*}$dst", [(X86call GR64:$dst)]>, + Requires<[In64BitMode, NotWin64]>; + def CALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst, variable_ops), + "call{q}\t{*}$dst", [(X86call (loadi64 addr:$dst))]>, + Requires<[In64BitMode, NotWin64]>; + + def FARCALL64 : RI<0xFF, MRM3m, (outs), (ins opaque80mem:$dst), + "lcall{q}\t{*}$dst", []>; + } + + // FIXME: We need to teach codegen about single list of call-clobbered + // registers. +let isCall = 1, isCodeGenOnly = 1 in + // All calls clobber the non-callee saved registers. RSP is marked as + // a use to prevent stack-pointer assignments that appear immediately + // before calls from potentially appearing dead. Uses for argument + // registers are added manually. + let Defs = [RAX, RCX, RDX, R8, R9, R10, R11, + FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, ST1, + MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, + XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, EFLAGS], + Uses = [RSP] in { + def WINCALL64pcrel32 : Ii32PCRel<0xE8, RawFrm, + (outs), (ins i64i32imm_pcrel:$dst, variable_ops), + "call{q}\t$dst", []>, + Requires<[IsWin64]>; + def WINCALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst, variable_ops), + "call{q}\t{*}$dst", + [(X86call GR64:$dst)]>, Requires<[IsWin64]>; + def WINCALL64m : I<0xFF, MRM2m, (outs), + (ins i64mem:$dst,variable_ops), + "call{q}\t{*}$dst", + [(X86call (loadi64 addr:$dst))]>, + Requires<[IsWin64]>; + } + +let isCall = 1, isCodeGenOnly = 1 in + // __chkstk(MSVC): clobber R10, R11 and EFLAGS. + // ___chkstk(Mingw64): clobber R10, R11, RAX and EFLAGS, and update RSP. + let Defs = [RAX, R10, R11, RSP, EFLAGS], + Uses = [RSP] in { + def W64ALLOCA : Ii32PCRel<0xE8, RawFrm, + (outs), (ins i64i32imm_pcrel:$dst, variable_ops), + "call{q}\t$dst", []>, + Requires<[IsWin64]>; + } + +let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, + isCodeGenOnly = 1 in + // AMD64 cc clobbers RSI, RDI, XMM6-XMM15. + let Defs = [RAX, RCX, RDX, R8, R9, R10, R11, + FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, ST1, + MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, + XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, EFLAGS], + Uses = [RSP], + usesCustomInserter = 1 in { + def TCRETURNdi64 : PseudoI<(outs), + (ins i64i32imm_pcrel:$dst, i32imm:$offset, variable_ops), + []>; + def TCRETURNri64 : PseudoI<(outs), + (ins ptr_rc_tailcall:$dst, i32imm:$offset, variable_ops), []>; + let mayLoad = 1 in + def TCRETURNmi64 : PseudoI<(outs), + (ins i64mem_TC:$dst, i32imm:$offset, variable_ops), []>; + + def TAILJMPd64 : Ii32PCRel<0xE9, RawFrm, (outs), + (ins i64i32imm_pcrel:$dst, variable_ops), + "jmp\t$dst # TAILCALL", []>; + def TAILJMPr64 : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst, variable_ops), + "jmp{q}\t{*}$dst # TAILCALL", []>; + + let mayLoad = 1 in + def TAILJMPm64 : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst, variable_ops), + "jmp{q}\t{*}$dst # TAILCALL", []>; +} diff --git a/contrib/llvm/lib/Target/X86/X86InstrExtension.td b/contrib/llvm/lib/Target/X86/X86InstrExtension.td new file mode 100644 index 0000000..e62e6b7 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrExtension.td @@ -0,0 +1,151 @@ +//===- X86InstrExtension.td - Sign and Zero Extensions -----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file describes the sign and zero extension operations. +// +//===----------------------------------------------------------------------===// + +let neverHasSideEffects = 1 in { + let Defs = [AX], Uses = [AL] in + def CBW : I<0x98, RawFrm, (outs), (ins), + "{cbtw|cbw}", []>, OpSize; // AX = signext(AL) + let Defs = [EAX], Uses = [AX] in + def CWDE : I<0x98, RawFrm, (outs), (ins), + "{cwtl|cwde}", []>; // EAX = signext(AX) + + let Defs = [AX,DX], Uses = [AX] in + def CWD : I<0x99, RawFrm, (outs), (ins), + "{cwtd|cwd}", []>, OpSize; // DX:AX = signext(AX) + let Defs = [EAX,EDX], Uses = [EAX] in + def CDQ : I<0x99, RawFrm, (outs), (ins), + "{cltd|cdq}", []>; // EDX:EAX = signext(EAX) + + + let Defs = [RAX], Uses = [EAX] in + def CDQE : RI<0x98, RawFrm, (outs), (ins), + "{cltq|cdqe}", []>; // RAX = signext(EAX) + + let Defs = [RAX,RDX], Uses = [RAX] in + def CQO : RI<0x99, RawFrm, (outs), (ins), + "{cqto|cqo}", []>; // RDX:RAX = signext(RAX) +} + + +// Sign/Zero extenders +def MOVSX16rr8 : I<0xBE, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src), + "movs{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; +def MOVSX16rm8 : I<0xBE, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src), + "movs{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; +def MOVSX32rr8 : I<0xBE, MRMSrcReg, (outs GR32:$dst), (ins GR8:$src), + "movs{bl|x}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (sext GR8:$src))]>, TB; +def MOVSX32rm8 : I<0xBE, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src), + "movs{bl|x}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (sextloadi32i8 addr:$src))]>, TB; +def MOVSX32rr16: I<0xBF, MRMSrcReg, (outs GR32:$dst), (ins GR16:$src), + "movs{wl|x}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (sext GR16:$src))]>, TB; +def MOVSX32rm16: I<0xBF, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src), + "movs{wl|x}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (sextloadi32i16 addr:$src))]>, TB; + +def MOVZX16rr8 : I<0xB6, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src), + "movz{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; +def MOVZX16rm8 : I<0xB6, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src), + "movz{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; +def MOVZX32rr8 : I<0xB6, MRMSrcReg, (outs GR32:$dst), (ins GR8 :$src), + "movz{bl|x}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (zext GR8:$src))]>, TB; +def MOVZX32rm8 : I<0xB6, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src), + "movz{bl|x}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (zextloadi32i8 addr:$src))]>, TB; +def MOVZX32rr16: I<0xB7, MRMSrcReg, (outs GR32:$dst), (ins GR16:$src), + "movz{wl|x}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (zext GR16:$src))]>, TB; +def MOVZX32rm16: I<0xB7, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src), + "movz{wl|x}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (zextloadi32i16 addr:$src))]>, TB; + +// These are the same as the regular MOVZX32rr8 and MOVZX32rm8 +// except that they use GR32_NOREX for the output operand register class +// instead of GR32. This allows them to operate on h registers on x86-64. 
+def MOVZX32_NOREXrr8 : I<0xB6, MRMSrcReg, + (outs GR32_NOREX:$dst), (ins GR8_NOREX:$src), + "movz{bl|x}\t{$src, $dst|$dst, $src}", + []>, TB; +let mayLoad = 1 in +def MOVZX32_NOREXrm8 : I<0xB6, MRMSrcMem, + (outs GR32_NOREX:$dst), (ins i8mem_NOREX:$src), + "movz{bl|x}\t{$src, $dst|$dst, $src}", + []>, TB; + +// MOVSX64rr8 always has a REX prefix and it has an 8-bit register +// operand, which makes it a rare instruction with an 8-bit register +// operand that can never access an h register. If support for h registers +// were generalized, this would require a special register class. +def MOVSX64rr8 : RI<0xBE, MRMSrcReg, (outs GR64:$dst), (ins GR8 :$src), + "movs{bq|x}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (sext GR8:$src))]>, TB; +def MOVSX64rm8 : RI<0xBE, MRMSrcMem, (outs GR64:$dst), (ins i8mem :$src), + "movs{bq|x}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (sextloadi64i8 addr:$src))]>, TB; +def MOVSX64rr16: RI<0xBF, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src), + "movs{wq|x}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (sext GR16:$src))]>, TB; +def MOVSX64rm16: RI<0xBF, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src), + "movs{wq|x}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (sextloadi64i16 addr:$src))]>, TB; +def MOVSX64rr32: RI<0x63, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src), + "movs{lq|xd}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (sext GR32:$src))]>; +def MOVSX64rm32: RI<0x63, MRMSrcMem, (outs GR64:$dst), (ins i32mem:$src), + "movs{lq|xd}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (sextloadi64i32 addr:$src))]>; + +// movzbq and movzwq encodings for the disassembler +def MOVZX64rr8_Q : RI<0xB6, MRMSrcReg, (outs GR64:$dst), (ins GR8:$src), + "movz{bq|x}\t{$src, $dst|$dst, $src}", []>, TB; +def MOVZX64rm8_Q : RI<0xB6, MRMSrcMem, (outs GR64:$dst), (ins i8mem:$src), + "movz{bq|x}\t{$src, $dst|$dst, $src}", []>, TB; +def MOVZX64rr16_Q : RI<0xB7, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src), + "movz{wq|x}\t{$src, $dst|$dst, $src}", []>, TB; +def MOVZX64rm16_Q : RI<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src), + "movz{wq|x}\t{$src, $dst|$dst, $src}", []>, TB; + +// FIXME: These should be Pat patterns. +let isCodeGenOnly = 1 in { + +// Use movzbl instead of movzbq when the destination is a register; it's +// equivalent due to implicit zero-extending, and it has a smaller encoding. +def MOVZX64rr8 : I<0xB6, MRMSrcReg, (outs GR64:$dst), (ins GR8 :$src), + "", [(set GR64:$dst, (zext GR8:$src))]>, TB; +def MOVZX64rm8 : I<0xB6, MRMSrcMem, (outs GR64:$dst), (ins i8mem :$src), + "", [(set GR64:$dst, (zextloadi64i8 addr:$src))]>, TB; +// Use movzwl instead of movzwq when the destination is a register; it's +// equivalent due to implicit zero-extending, and it has a smaller encoding. +def MOVZX64rr16: I<0xB7, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src), + "", [(set GR64:$dst, (zext GR16:$src))]>, TB; +def MOVZX64rm16: I<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src), + "", [(set GR64:$dst, (zextloadi64i16 addr:$src))]>, TB; + +// There's no movzlq instruction, but movl can be used for this purpose, using +// implicit zero-extension. The preferred way to do 32-bit-to-64-bit zero +// extension on x86-64 is to use a SUBREG_TO_REG to utilize implicit +// zero-extension, however this isn't possible when the 32-bit value is +// defined by a truncate or is copied from something where the high bits aren't +// necessarily all zero. In such cases, we fall back to these explicit zext +// instructions. 
+def MOVZX64rr32 : I<0x89, MRMDestReg, (outs GR64:$dst), (ins GR32:$src), + "", [(set GR64:$dst, (zext GR32:$src))]>; +def MOVZX64rm32 : I<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i32mem:$src), + "", [(set GR64:$dst, (zextloadi64i32 addr:$src))]>; + + +} + diff --git a/contrib/llvm/lib/Target/X86/X86InstrFMA.td b/contrib/llvm/lib/Target/X86/X86InstrFMA.td new file mode 100644 index 0000000..d868773 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrFMA.td @@ -0,0 +1,60 @@ +//====- X86InstrFMA.td - Describe the X86 Instruction Set --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes FMA (Fused Multiply-Add) instructions. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// FMA3 - Intel 3 operand Fused Multiply-Add instructions +//===----------------------------------------------------------------------===// + +multiclass fma_rm<bits<8> opc, string OpcodeStr> { + def r : FMA3<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>; + def m : FMA3<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, f128mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>; + def rY : FMA3<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>; + def mY : FMA3<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, f256mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>; +} + +multiclass fma_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231, + string OpcodeStr, string PackTy> { + defm r132 : fma_rm<opc132, !strconcat(OpcodeStr, !strconcat("132", PackTy))>; + defm r213 : fma_rm<opc213, !strconcat(OpcodeStr, !strconcat("213", PackTy))>; + defm r231 : fma_rm<opc231, !strconcat(OpcodeStr, !strconcat("231", PackTy))>; +} + +let isAsmParserOnly = 1 in { + // Fused Multiply-Add + defm VFMADDPS : fma_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps">; + defm VFMADDPD : fma_forms<0x98, 0xA8, 0xB8, "vfmadd", "pd">, VEX_W; + defm VFMADDSUBPS : fma_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps">; + defm VFMADDSUBPD : fma_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd">, VEX_W; + defm VFMSUBADDPS : fma_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "ps">; + defm VFMSUBADDPD : fma_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "pd">, VEX_W; + defm VFMSUBPS : fma_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps">; + defm VFMSUBPD : fma_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd">, VEX_W; + + // Fused Negative Multiply-Add + defm VFNMADDPS : fma_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps">; + defm VFNMADDPD : fma_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd">, VEX_W; + defm VFNMSUBPS : fma_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps">; + defm VFNMSUBPD : fma_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd">, VEX_W; +} diff --git a/contrib/llvm/lib/Target/X86/X86InstrFPStack.td b/contrib/llvm/lib/Target/X86/X86InstrFPStack.td new file mode 100644 index 0000000..7cb870f --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrFPStack.td @@ -0,0 +1,648 @@ +//==- X86InstrFPStack.td - Describe the X86 Instruction Set --*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is 
distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the X86 x87 FPU instruction set, defining the +// instructions, and properties of the instructions which are needed for code +// generation, machine code emission, and analysis. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// FPStack specific DAG Nodes. +//===----------------------------------------------------------------------===// + +def SDTX86FpGet2 : SDTypeProfile<2, 0, [SDTCisVT<0, f80>, + SDTCisVT<1, f80>]>; +def SDTX86Fld : SDTypeProfile<1, 2, [SDTCisFP<0>, + SDTCisPtrTy<1>, + SDTCisVT<2, OtherVT>]>; +def SDTX86Fst : SDTypeProfile<0, 3, [SDTCisFP<0>, + SDTCisPtrTy<1>, + SDTCisVT<2, OtherVT>]>; +def SDTX86Fild : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisPtrTy<1>, + SDTCisVT<2, OtherVT>]>; +def SDTX86FpToIMem : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisPtrTy<1>]>; + +def SDTX86CwdStore : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; + +def X86fld : SDNode<"X86ISD::FLD", SDTX86Fld, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def X86fst : SDNode<"X86ISD::FST", SDTX86Fst, + [SDNPHasChain, SDNPInGlue, SDNPMayStore, + SDNPMemOperand]>; +def X86fild : SDNode<"X86ISD::FILD", SDTX86Fild, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def X86fildflag : SDNode<"X86ISD::FILD_FLAG", SDTX86Fild, + [SDNPHasChain, SDNPOutGlue, SDNPMayLoad, + SDNPMemOperand]>; +def X86fp_to_i16mem : SDNode<"X86ISD::FP_TO_INT16_IN_MEM", SDTX86FpToIMem, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +def X86fp_to_i32mem : SDNode<"X86ISD::FP_TO_INT32_IN_MEM", SDTX86FpToIMem, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +def X86fp_to_i64mem : SDNode<"X86ISD::FP_TO_INT64_IN_MEM", SDTX86FpToIMem, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +def X86fp_cwd_get16 : SDNode<"X86ISD::FNSTCW16m", SDTX86CwdStore, + [SDNPHasChain, SDNPMayStore, SDNPSideEffect, + SDNPMemOperand]>; + +//===----------------------------------------------------------------------===// +// FPStack pattern fragments +//===----------------------------------------------------------------------===// + +def fpimm0 : PatLeaf<(fpimm), [{ + return N->isExactlyValue(+0.0); +}]>; + +def fpimmneg0 : PatLeaf<(fpimm), [{ + return N->isExactlyValue(-0.0); +}]>; + +def fpimm1 : PatLeaf<(fpimm), [{ + return N->isExactlyValue(+1.0); +}]>; + +def fpimmneg1 : PatLeaf<(fpimm), [{ + return N->isExactlyValue(-1.0); +}]>; + +// Some 'special' instructions +let usesCustomInserter = 1 in { // Expanded after instruction selection. 
+ def FP32_TO_INT16_IN_MEM : PseudoI<(outs), (ins i16mem:$dst, RFP32:$src), + [(X86fp_to_i16mem RFP32:$src, addr:$dst)]>; + def FP32_TO_INT32_IN_MEM : PseudoI<(outs), (ins i32mem:$dst, RFP32:$src), + [(X86fp_to_i32mem RFP32:$src, addr:$dst)]>; + def FP32_TO_INT64_IN_MEM : PseudoI<(outs), (ins i64mem:$dst, RFP32:$src), + [(X86fp_to_i64mem RFP32:$src, addr:$dst)]>; + def FP64_TO_INT16_IN_MEM : PseudoI<(outs), (ins i16mem:$dst, RFP64:$src), + [(X86fp_to_i16mem RFP64:$src, addr:$dst)]>; + def FP64_TO_INT32_IN_MEM : PseudoI<(outs), (ins i32mem:$dst, RFP64:$src), + [(X86fp_to_i32mem RFP64:$src, addr:$dst)]>; + def FP64_TO_INT64_IN_MEM : PseudoI<(outs), (ins i64mem:$dst, RFP64:$src), + [(X86fp_to_i64mem RFP64:$src, addr:$dst)]>; + def FP80_TO_INT16_IN_MEM : PseudoI<(outs), (ins i16mem:$dst, RFP80:$src), + [(X86fp_to_i16mem RFP80:$src, addr:$dst)]>; + def FP80_TO_INT32_IN_MEM : PseudoI<(outs), (ins i32mem:$dst, RFP80:$src), + [(X86fp_to_i32mem RFP80:$src, addr:$dst)]>; + def FP80_TO_INT64_IN_MEM : PseudoI<(outs), (ins i64mem:$dst, RFP80:$src), + [(X86fp_to_i64mem RFP80:$src, addr:$dst)]>; +} + +// All FP Stack operations are represented with four instructions here. The +// first three instructions, generated by the instruction selector, use "RFP32" +// "RFP64" or "RFP80" registers: traditional register files to reference 32-bit, +// 64-bit or 80-bit floating point values. These sizes apply to the values, +// not the registers, which are always 80 bits; RFP32, RFP64 and RFP80 can be +// copied to each other without losing information. These instructions are all +// pseudo instructions and use the "_Fp" suffix. +// In some cases there are additional variants with a mixture of different +// register sizes. +// The second instruction is defined with FPI, which is the actual instruction +// emitted by the assembler. These use "RST" registers, although frequently +// the actual register(s) used are implicit. These are always 80 bits. +// The FP stackifier pass converts one to the other after register allocation +// occurs. +// +// Note that the FpI instruction should have instruction selection info (e.g. +// a pattern) and the FPI instruction should have emission info (e.g. opcode +// encoding and asm printing info). + +// Pseudo Instruction for FP stack return values. +def FpPOP_RETVAL : FpI_<(outs RFP80:$dst), (ins), SpecialFP, []>; + +// FpIf32, FpIf64 - Floating Point Pseudo Instruction template. +// f32 instructions can use SSE1 and are predicated on FPStackf32 == !SSE1. +// f64 instructions can use SSE2 and are predicated on FPStackf64 == !SSE2. +// f80 instructions cannot use SSE and use neither of these. +class FpIf32<dag outs, dag ins, FPFormat fp, list<dag> pattern> : + FpI_<outs, ins, fp, pattern>, Requires<[FPStackf32]>; +class FpIf64<dag outs, dag ins, FPFormat fp, list<dag> pattern> : + FpI_<outs, ins, fp, pattern>, Requires<[FPStackf64]>; + +// Factoring for arithmetic. +multiclass FPBinary_rr<SDNode OpNode> { +// Register op register -> register +// These are separated out because they have no reversed form. 
+def _Fp32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2), TwoArgFP, + [(set RFP32:$dst, (OpNode RFP32:$src1, RFP32:$src2))]>; +def _Fp64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, RFP64:$src2), TwoArgFP, + [(set RFP64:$dst, (OpNode RFP64:$src1, RFP64:$src2))]>; +def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2), TwoArgFP, + [(set RFP80:$dst, (OpNode RFP80:$src1, RFP80:$src2))]>; +} +// The FopST0 series are not included here because of the irregularities +// in where the 'r' goes in assembly output. +// These instructions cannot address 80-bit memory. +multiclass FPBinary<SDNode OpNode, Format fp, string asmstring> { +// ST(0) = ST(0) + [mem] +def _Fp32m : FpIf32<(outs RFP32:$dst), + (ins RFP32:$src1, f32mem:$src2), OneArgFPRW, + [(set RFP32:$dst, + (OpNode RFP32:$src1, (loadf32 addr:$src2)))]>; +def _Fp64m : FpIf64<(outs RFP64:$dst), + (ins RFP64:$src1, f64mem:$src2), OneArgFPRW, + [(set RFP64:$dst, + (OpNode RFP64:$src1, (loadf64 addr:$src2)))]>; +def _Fp64m32: FpIf64<(outs RFP64:$dst), + (ins RFP64:$src1, f32mem:$src2), OneArgFPRW, + [(set RFP64:$dst, + (OpNode RFP64:$src1, (f64 (extloadf32 addr:$src2))))]>; +def _Fp80m32: FpI_<(outs RFP80:$dst), + (ins RFP80:$src1, f32mem:$src2), OneArgFPRW, + [(set RFP80:$dst, + (OpNode RFP80:$src1, (f80 (extloadf32 addr:$src2))))]>; +def _Fp80m64: FpI_<(outs RFP80:$dst), + (ins RFP80:$src1, f64mem:$src2), OneArgFPRW, + [(set RFP80:$dst, + (OpNode RFP80:$src1, (f80 (extloadf64 addr:$src2))))]>; +def _F32m : FPI<0xD8, fp, (outs), (ins f32mem:$src), + !strconcat("f", asmstring, "{s}\t$src")> { + let mayLoad = 1; +} +def _F64m : FPI<0xDC, fp, (outs), (ins f64mem:$src), + !strconcat("f", asmstring, "{l}\t$src")> { + let mayLoad = 1; +} +// ST(0) = ST(0) + [memint] +def _FpI16m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i16mem:$src2), + OneArgFPRW, + [(set RFP32:$dst, (OpNode RFP32:$src1, + (X86fild addr:$src2, i16)))]>; +def _FpI32m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i32mem:$src2), + OneArgFPRW, + [(set RFP32:$dst, (OpNode RFP32:$src1, + (X86fild addr:$src2, i32)))]>; +def _FpI16m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i16mem:$src2), + OneArgFPRW, + [(set RFP64:$dst, (OpNode RFP64:$src1, + (X86fild addr:$src2, i16)))]>; +def _FpI32m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i32mem:$src2), + OneArgFPRW, + [(set RFP64:$dst, (OpNode RFP64:$src1, + (X86fild addr:$src2, i32)))]>; +def _FpI16m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i16mem:$src2), + OneArgFPRW, + [(set RFP80:$dst, (OpNode RFP80:$src1, + (X86fild addr:$src2, i16)))]>; +def _FpI32m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i32mem:$src2), + OneArgFPRW, + [(set RFP80:$dst, (OpNode RFP80:$src1, + (X86fild addr:$src2, i32)))]>; +def _FI16m : FPI<0xDE, fp, (outs), (ins i16mem:$src), + !strconcat("fi", asmstring, "{s}\t$src")> { + let mayLoad = 1; +} +def _FI32m : FPI<0xDA, fp, (outs), (ins i32mem:$src), + !strconcat("fi", asmstring, "{l}\t$src")> { + let mayLoad = 1; +} +} + +defm ADD : FPBinary_rr<fadd>; +defm SUB : FPBinary_rr<fsub>; +defm MUL : FPBinary_rr<fmul>; +defm DIV : FPBinary_rr<fdiv>; +defm ADD : FPBinary<fadd, MRM0m, "add">; +defm SUB : FPBinary<fsub, MRM4m, "sub">; +defm SUBR: FPBinary<fsub ,MRM5m, "subr">; +defm MUL : FPBinary<fmul, MRM1m, "mul">; +defm DIV : FPBinary<fdiv, MRM6m, "div">; +defm DIVR: FPBinary<fdiv, MRM7m, "divr">; + +class FPST0rInst<bits<8> o, string asm> + : FPI<o, AddRegFrm, (outs), (ins RST:$op), asm>, D8; +class FPrST0Inst<bits<8> o, string asm> + : FPI<o, AddRegFrm, (outs), (ins RST:$op), 
asm>, DC; +class FPrST0PInst<bits<8> o, string asm> + : FPI<o, AddRegFrm, (outs), (ins RST:$op), asm>, DE; + +// NOTE: GAS and apparently all other AT&T style assemblers have a broken notion +// of some of the 'reverse' forms of the fsub and fdiv instructions. As such, +// we have to put some 'r's in and take them out of weird places. +def ADD_FST0r : FPST0rInst <0xC0, "fadd\t$op">; +def ADD_FrST0 : FPrST0Inst <0xC0, "fadd\t{%st(0), $op|$op, %ST(0)}">; +def ADD_FPrST0 : FPrST0PInst<0xC0, "faddp\t$op">; +def SUBR_FST0r : FPST0rInst <0xE8, "fsubr\t$op">; +def SUB_FrST0 : FPrST0Inst <0xE8, "fsub{r}\t{%st(0), $op|$op, %ST(0)}">; +def SUB_FPrST0 : FPrST0PInst<0xE8, "fsub{r}p\t$op">; +def SUB_FST0r : FPST0rInst <0xE0, "fsub\t$op">; +def SUBR_FrST0 : FPrST0Inst <0xE0, "fsub{|r}\t{%st(0), $op|$op, %ST(0)}">; +def SUBR_FPrST0 : FPrST0PInst<0xE0, "fsub{|r}p\t$op">; +def MUL_FST0r : FPST0rInst <0xC8, "fmul\t$op">; +def MUL_FrST0 : FPrST0Inst <0xC8, "fmul\t{%st(0), $op|$op, %ST(0)}">; +def MUL_FPrST0 : FPrST0PInst<0xC8, "fmulp\t$op">; +def DIVR_FST0r : FPST0rInst <0xF8, "fdivr\t$op">; +def DIV_FrST0 : FPrST0Inst <0xF8, "fdiv{r}\t{%st(0), $op|$op, %ST(0)}">; +def DIV_FPrST0 : FPrST0PInst<0xF8, "fdiv{r}p\t$op">; +def DIV_FST0r : FPST0rInst <0xF0, "fdiv\t$op">; +def DIVR_FrST0 : FPrST0Inst <0xF0, "fdiv{|r}\t{%st(0), $op|$op, %ST(0)}">; +def DIVR_FPrST0 : FPrST0PInst<0xF0, "fdiv{|r}p\t$op">; + +def COM_FST0r : FPST0rInst <0xD0, "fcom\t$op">; +def COMP_FST0r : FPST0rInst <0xD8, "fcomp\t$op">; + +// Unary operations. +multiclass FPUnary<SDNode OpNode, bits<8> opcode, string asmstring> { +def _Fp32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src), OneArgFPRW, + [(set RFP32:$dst, (OpNode RFP32:$src))]>; +def _Fp64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src), OneArgFPRW, + [(set RFP64:$dst, (OpNode RFP64:$src))]>; +def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src), OneArgFPRW, + [(set RFP80:$dst, (OpNode RFP80:$src))]>; +def _F : FPI<opcode, RawFrm, (outs), (ins), asmstring>, D9; +} + +defm CHS : FPUnary<fneg, 0xE0, "fchs">; +defm ABS : FPUnary<fabs, 0xE1, "fabs">; +defm SQRT: FPUnary<fsqrt,0xFA, "fsqrt">; +defm SIN : FPUnary<fsin, 0xFE, "fsin">; +defm COS : FPUnary<fcos, 0xFF, "fcos">; + +let neverHasSideEffects = 1 in { +def TST_Fp32 : FpIf32<(outs), (ins RFP32:$src), OneArgFP, []>; +def TST_Fp64 : FpIf64<(outs), (ins RFP64:$src), OneArgFP, []>; +def TST_Fp80 : FpI_<(outs), (ins RFP80:$src), OneArgFP, []>; +} +def TST_F : FPI<0xE4, RawFrm, (outs), (ins), "ftst">, D9; + +// Versions of FP instructions that take a single memory operand. Added for the +// disassembler; remove as they are included with patterns elsewhere. 
+def FCOM32m : FPI<0xD8, MRM2m, (outs), (ins f32mem:$src), "fcom{s}\t$src">; +def FCOMP32m : FPI<0xD8, MRM3m, (outs), (ins f32mem:$src), "fcomp{s}\t$src">; + +def FLDENVm : FPI<0xD9, MRM4m, (outs), (ins f32mem:$src), "fldenv\t$src">; +def FSTENVm : FPI<0xD9, MRM6m, (outs f32mem:$dst), (ins), "fnstenv\t$dst">; + +def FICOM32m : FPI<0xDA, MRM2m, (outs), (ins i32mem:$src), "ficom{l}\t$src">; +def FICOMP32m: FPI<0xDA, MRM3m, (outs), (ins i32mem:$src), "ficomp{l}\t$src">; + +def FCOM64m : FPI<0xDC, MRM2m, (outs), (ins f64mem:$src), "fcom{l}\t$src">; +def FCOMP64m : FPI<0xDC, MRM3m, (outs), (ins f64mem:$src), "fcomp{l}\t$src">; + +def FRSTORm : FPI<0xDD, MRM4m, (outs f32mem:$dst), (ins), "frstor\t$dst">; +def FSAVEm : FPI<0xDD, MRM6m, (outs f32mem:$dst), (ins), "fnsave\t$dst">; +def FNSTSWm : FPI<0xDD, MRM7m, (outs f32mem:$dst), (ins), "fnstsw\t$dst">; + +def FICOM16m : FPI<0xDE, MRM2m, (outs), (ins i16mem:$src), "ficom{s}\t$src">; +def FICOMP16m: FPI<0xDE, MRM3m, (outs), (ins i16mem:$src), "ficomp{s}\t$src">; + +def FBLDm : FPI<0xDF, MRM4m, (outs), (ins f32mem:$src), "fbld\t$src">; +def FBSTPm : FPI<0xDF, MRM6m, (outs f32mem:$dst), (ins), "fbstp\t$dst">; + +// Floating point cmovs. +class FpIf32CMov<dag outs, dag ins, FPFormat fp, list<dag> pattern> : + FpI_<outs, ins, fp, pattern>, Requires<[FPStackf32, HasCMov]>; +class FpIf64CMov<dag outs, dag ins, FPFormat fp, list<dag> pattern> : + FpI_<outs, ins, fp, pattern>, Requires<[FPStackf64, HasCMov]>; + +multiclass FPCMov<PatLeaf cc> { + def _Fp32 : FpIf32CMov<(outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2), + CondMovFP, + [(set RFP32:$dst, (X86cmov RFP32:$src1, RFP32:$src2, + cc, EFLAGS))]>; + def _Fp64 : FpIf64CMov<(outs RFP64:$dst), (ins RFP64:$src1, RFP64:$src2), + CondMovFP, + [(set RFP64:$dst, (X86cmov RFP64:$src1, RFP64:$src2, + cc, EFLAGS))]>; + def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2), + CondMovFP, + [(set RFP80:$dst, (X86cmov RFP80:$src1, RFP80:$src2, + cc, EFLAGS))]>, + Requires<[HasCMov]>; +} + +let Uses = [EFLAGS], Constraints = "$src1 = $dst" in { +defm CMOVB : FPCMov<X86_COND_B>; +defm CMOVBE : FPCMov<X86_COND_BE>; +defm CMOVE : FPCMov<X86_COND_E>; +defm CMOVP : FPCMov<X86_COND_P>; +defm CMOVNB : FPCMov<X86_COND_AE>; +defm CMOVNBE: FPCMov<X86_COND_A>; +defm CMOVNE : FPCMov<X86_COND_NE>; +defm CMOVNP : FPCMov<X86_COND_NP>; +} // Uses = [EFLAGS], Constraints = "$src1 = $dst" + +let Predicates = [HasCMov] in { +// These are not factored because there's no clean way to pass DA/DB. +def CMOVB_F : FPI<0xC0, AddRegFrm, (outs RST:$op), (ins), + "fcmovb\t{$op, %st(0)|%ST(0), $op}">, DA; +def CMOVBE_F : FPI<0xD0, AddRegFrm, (outs RST:$op), (ins), + "fcmovbe\t{$op, %st(0)|%ST(0), $op}">, DA; +def CMOVE_F : FPI<0xC8, AddRegFrm, (outs RST:$op), (ins), + "fcmove\t{$op, %st(0)|%ST(0), $op}">, DA; +def CMOVP_F : FPI<0xD8, AddRegFrm, (outs RST:$op), (ins), + "fcmovu\t {$op, %st(0)|%ST(0), $op}">, DA; +def CMOVNB_F : FPI<0xC0, AddRegFrm, (outs RST:$op), (ins), + "fcmovnb\t{$op, %st(0)|%ST(0), $op}">, DB; +def CMOVNBE_F: FPI<0xD0, AddRegFrm, (outs RST:$op), (ins), + "fcmovnbe\t{$op, %st(0)|%ST(0), $op}">, DB; +def CMOVNE_F : FPI<0xC8, AddRegFrm, (outs RST:$op), (ins), + "fcmovne\t{$op, %st(0)|%ST(0), $op}">, DB; +def CMOVNP_F : FPI<0xD8, AddRegFrm, (outs RST:$op), (ins), + "fcmovnu\t{$op, %st(0)|%ST(0), $op}">, DB; +} // Predicates = [HasCMov] + +// Floating point loads & stores. 
+let canFoldAsLoad = 1 in { +def LD_Fp32m : FpIf32<(outs RFP32:$dst), (ins f32mem:$src), ZeroArgFP, + [(set RFP32:$dst, (loadf32 addr:$src))]>; +let isReMaterializable = 1 in + def LD_Fp64m : FpIf64<(outs RFP64:$dst), (ins f64mem:$src), ZeroArgFP, + [(set RFP64:$dst, (loadf64 addr:$src))]>; +def LD_Fp80m : FpI_<(outs RFP80:$dst), (ins f80mem:$src), ZeroArgFP, + [(set RFP80:$dst, (loadf80 addr:$src))]>; +} +def LD_Fp32m64 : FpIf64<(outs RFP64:$dst), (ins f32mem:$src), ZeroArgFP, + [(set RFP64:$dst, (f64 (extloadf32 addr:$src)))]>; +def LD_Fp64m80 : FpI_<(outs RFP80:$dst), (ins f64mem:$src), ZeroArgFP, + [(set RFP80:$dst, (f80 (extloadf64 addr:$src)))]>; +def LD_Fp32m80 : FpI_<(outs RFP80:$dst), (ins f32mem:$src), ZeroArgFP, + [(set RFP80:$dst, (f80 (extloadf32 addr:$src)))]>; +def ILD_Fp16m32: FpIf32<(outs RFP32:$dst), (ins i16mem:$src), ZeroArgFP, + [(set RFP32:$dst, (X86fild addr:$src, i16))]>; +def ILD_Fp32m32: FpIf32<(outs RFP32:$dst), (ins i32mem:$src), ZeroArgFP, + [(set RFP32:$dst, (X86fild addr:$src, i32))]>; +def ILD_Fp64m32: FpIf32<(outs RFP32:$dst), (ins i64mem:$src), ZeroArgFP, + [(set RFP32:$dst, (X86fild addr:$src, i64))]>; +def ILD_Fp16m64: FpIf64<(outs RFP64:$dst), (ins i16mem:$src), ZeroArgFP, + [(set RFP64:$dst, (X86fild addr:$src, i16))]>; +def ILD_Fp32m64: FpIf64<(outs RFP64:$dst), (ins i32mem:$src), ZeroArgFP, + [(set RFP64:$dst, (X86fild addr:$src, i32))]>; +def ILD_Fp64m64: FpIf64<(outs RFP64:$dst), (ins i64mem:$src), ZeroArgFP, + [(set RFP64:$dst, (X86fild addr:$src, i64))]>; +def ILD_Fp16m80: FpI_<(outs RFP80:$dst), (ins i16mem:$src), ZeroArgFP, + [(set RFP80:$dst, (X86fild addr:$src, i16))]>; +def ILD_Fp32m80: FpI_<(outs RFP80:$dst), (ins i32mem:$src), ZeroArgFP, + [(set RFP80:$dst, (X86fild addr:$src, i32))]>; +def ILD_Fp64m80: FpI_<(outs RFP80:$dst), (ins i64mem:$src), ZeroArgFP, + [(set RFP80:$dst, (X86fild addr:$src, i64))]>; + +def ST_Fp32m : FpIf32<(outs), (ins f32mem:$op, RFP32:$src), OneArgFP, + [(store RFP32:$src, addr:$op)]>; +def ST_Fp64m32 : FpIf64<(outs), (ins f32mem:$op, RFP64:$src), OneArgFP, + [(truncstoref32 RFP64:$src, addr:$op)]>; +def ST_Fp64m : FpIf64<(outs), (ins f64mem:$op, RFP64:$src), OneArgFP, + [(store RFP64:$src, addr:$op)]>; +def ST_Fp80m32 : FpI_<(outs), (ins f32mem:$op, RFP80:$src), OneArgFP, + [(truncstoref32 RFP80:$src, addr:$op)]>; +def ST_Fp80m64 : FpI_<(outs), (ins f64mem:$op, RFP80:$src), OneArgFP, + [(truncstoref64 RFP80:$src, addr:$op)]>; +// FST does not support 80-bit memory target; FSTP must be used. 
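As an illustrative aside (not part of the patch), the value-size versus register-size split behind the store patterns above can be sketched in plain C++, assuming a host where long double is the x87 80-bit extended type: the register side always holds 80 bits, and only the chosen store width (cf. ST_Fp80m32 / ST_Fp80m64 / ST_FpP80m above) decides how much precision survives in memory.

#include <cstdio>

int main() {
  long double st  = 1.0L / 3.0L;  // stands in for an 80-bit value in ST(i)
  float       f32 = (float)st;    // cf. ST_Fp80m32: truncating 32-bit store
  double      f64 = (double)st;   // cf. ST_Fp80m64: truncating 64-bit store
  long double f80 = st;           // cf. ST_FpP80m:  lossless 80-bit store
  std::printf("f32=%.21g\nf64=%.21g\nf80=%.21Lg\n", (double)f32, f64, f80);
  return 0;
}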
+ +let mayStore = 1, neverHasSideEffects = 1 in { +def ST_FpP32m : FpIf32<(outs), (ins f32mem:$op, RFP32:$src), OneArgFP, []>; +def ST_FpP64m32 : FpIf64<(outs), (ins f32mem:$op, RFP64:$src), OneArgFP, []>; +def ST_FpP64m : FpIf64<(outs), (ins f64mem:$op, RFP64:$src), OneArgFP, []>; +def ST_FpP80m32 : FpI_<(outs), (ins f32mem:$op, RFP80:$src), OneArgFP, []>; +def ST_FpP80m64 : FpI_<(outs), (ins f64mem:$op, RFP80:$src), OneArgFP, []>; +} +def ST_FpP80m : FpI_<(outs), (ins f80mem:$op, RFP80:$src), OneArgFP, + [(store RFP80:$src, addr:$op)]>; +let mayStore = 1, neverHasSideEffects = 1 in { +def IST_Fp16m32 : FpIf32<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP, []>; +def IST_Fp32m32 : FpIf32<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP, []>; +def IST_Fp64m32 : FpIf32<(outs), (ins i64mem:$op, RFP32:$src), OneArgFP, []>; +def IST_Fp16m64 : FpIf64<(outs), (ins i16mem:$op, RFP64:$src), OneArgFP, []>; +def IST_Fp32m64 : FpIf64<(outs), (ins i32mem:$op, RFP64:$src), OneArgFP, []>; +def IST_Fp64m64 : FpIf64<(outs), (ins i64mem:$op, RFP64:$src), OneArgFP, []>; +def IST_Fp16m80 : FpI_<(outs), (ins i16mem:$op, RFP80:$src), OneArgFP, []>; +def IST_Fp32m80 : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP, []>; +def IST_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP, []>; +} + +let mayLoad = 1 in { +def LD_F32m : FPI<0xD9, MRM0m, (outs), (ins f32mem:$src), "fld{s}\t$src">; +def LD_F64m : FPI<0xDD, MRM0m, (outs), (ins f64mem:$src), "fld{l}\t$src">; +def LD_F80m : FPI<0xDB, MRM5m, (outs), (ins f80mem:$src), "fld{t}\t$src">; +def ILD_F16m : FPI<0xDF, MRM0m, (outs), (ins i16mem:$src), "fild{s}\t$src">; +def ILD_F32m : FPI<0xDB, MRM0m, (outs), (ins i32mem:$src), "fild{l}\t$src">; +def ILD_F64m : FPI<0xDF, MRM5m, (outs), (ins i64mem:$src), "fild{ll}\t$src">; +} +let mayStore = 1 in { +def ST_F32m : FPI<0xD9, MRM2m, (outs), (ins f32mem:$dst), "fst{s}\t$dst">; +def ST_F64m : FPI<0xDD, MRM2m, (outs), (ins f64mem:$dst), "fst{l}\t$dst">; +def ST_FP32m : FPI<0xD9, MRM3m, (outs), (ins f32mem:$dst), "fstp{s}\t$dst">; +def ST_FP64m : FPI<0xDD, MRM3m, (outs), (ins f64mem:$dst), "fstp{l}\t$dst">; +def ST_FP80m : FPI<0xDB, MRM7m, (outs), (ins f80mem:$dst), "fstp{t}\t$dst">; +def IST_F16m : FPI<0xDF, MRM2m, (outs), (ins i16mem:$dst), "fist{s}\t$dst">; +def IST_F32m : FPI<0xDB, MRM2m, (outs), (ins i32mem:$dst), "fist{l}\t$dst">; +def IST_FP16m : FPI<0xDF, MRM3m, (outs), (ins i16mem:$dst), "fistp{s}\t$dst">; +def IST_FP32m : FPI<0xDB, MRM3m, (outs), (ins i32mem:$dst), "fistp{l}\t$dst">; +def IST_FP64m : FPI<0xDF, MRM7m, (outs), (ins i64mem:$dst), "fistp{ll}\t$dst">; +} + +// FISTTP requires SSE3 even though it's a FPStack op. 
+def ISTT_Fp16m32 : FpI_<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP, + [(X86fp_to_i16mem RFP32:$src, addr:$op)]>, + Requires<[HasSSE3]>; +def ISTT_Fp32m32 : FpI_<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP, + [(X86fp_to_i32mem RFP32:$src, addr:$op)]>, + Requires<[HasSSE3]>; +def ISTT_Fp64m32 : FpI_<(outs), (ins i64mem:$op, RFP32:$src), OneArgFP, + [(X86fp_to_i64mem RFP32:$src, addr:$op)]>, + Requires<[HasSSE3]>; +def ISTT_Fp16m64 : FpI_<(outs), (ins i16mem:$op, RFP64:$src), OneArgFP, + [(X86fp_to_i16mem RFP64:$src, addr:$op)]>, + Requires<[HasSSE3]>; +def ISTT_Fp32m64 : FpI_<(outs), (ins i32mem:$op, RFP64:$src), OneArgFP, + [(X86fp_to_i32mem RFP64:$src, addr:$op)]>, + Requires<[HasSSE3]>; +def ISTT_Fp64m64 : FpI_<(outs), (ins i64mem:$op, RFP64:$src), OneArgFP, + [(X86fp_to_i64mem RFP64:$src, addr:$op)]>, + Requires<[HasSSE3]>; +def ISTT_Fp16m80 : FpI_<(outs), (ins i16mem:$op, RFP80:$src), OneArgFP, + [(X86fp_to_i16mem RFP80:$src, addr:$op)]>, + Requires<[HasSSE3]>; +def ISTT_Fp32m80 : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP, + [(X86fp_to_i32mem RFP80:$src, addr:$op)]>, + Requires<[HasSSE3]>; +def ISTT_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP, + [(X86fp_to_i64mem RFP80:$src, addr:$op)]>, + Requires<[HasSSE3]>; + +let mayStore = 1 in { +def ISTT_FP16m : FPI<0xDF, MRM1m, (outs), (ins i16mem:$dst), "fisttp{s}\t$dst">; +def ISTT_FP32m : FPI<0xDB, MRM1m, (outs), (ins i32mem:$dst), "fisttp{l}\t$dst">; +def ISTT_FP64m : FPI<0xDD, MRM1m, (outs), (ins i64mem:$dst), + "fisttp{ll}\t$dst">; +} + +// FP Stack manipulation instructions. +def LD_Frr : FPI<0xC0, AddRegFrm, (outs), (ins RST:$op), "fld\t$op">, D9; +def ST_Frr : FPI<0xD0, AddRegFrm, (outs), (ins RST:$op), "fst\t$op">, DD; +def ST_FPrr : FPI<0xD8, AddRegFrm, (outs), (ins RST:$op), "fstp\t$op">, DD; +def XCH_F : FPI<0xC8, AddRegFrm, (outs), (ins RST:$op), "fxch\t$op">, D9; + +// Floating point constant loads. +let isReMaterializable = 1 in { +def LD_Fp032 : FpIf32<(outs RFP32:$dst), (ins), ZeroArgFP, + [(set RFP32:$dst, fpimm0)]>; +def LD_Fp132 : FpIf32<(outs RFP32:$dst), (ins), ZeroArgFP, + [(set RFP32:$dst, fpimm1)]>; +def LD_Fp064 : FpIf64<(outs RFP64:$dst), (ins), ZeroArgFP, + [(set RFP64:$dst, fpimm0)]>; +def LD_Fp164 : FpIf64<(outs RFP64:$dst), (ins), ZeroArgFP, + [(set RFP64:$dst, fpimm1)]>; +def LD_Fp080 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP, + [(set RFP80:$dst, fpimm0)]>; +def LD_Fp180 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP, + [(set RFP80:$dst, fpimm1)]>; +} + +def LD_F0 : FPI<0xEE, RawFrm, (outs), (ins), "fldz">, D9; +def LD_F1 : FPI<0xE8, RawFrm, (outs), (ins), "fld1">, D9; + + +// Floating point compares. 
+let Defs = [EFLAGS] in {
+def UCOM_Fpr32 : FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
+                        []>;  // FPSW = cmp ST(0) with ST(i)
+def UCOM_Fpr64 : FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
+                        []>;  // FPSW = cmp ST(0) with ST(i)
+def UCOM_Fpr80 : FpI_  <(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
+                        []>;  // FPSW = cmp ST(0) with ST(i)
+
+// CC = ST(0) cmp ST(i)
+def UCOM_FpIr32: FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
+                        [(set EFLAGS, (X86cmp RFP32:$lhs, RFP32:$rhs))]>;
+def UCOM_FpIr64: FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
+                        [(set EFLAGS, (X86cmp RFP64:$lhs, RFP64:$rhs))]>;
+def UCOM_FpIr80: FpI_<(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
+                      [(set EFLAGS, (X86cmp RFP80:$lhs, RFP80:$rhs))]>;
+}
+
+let Defs = [EFLAGS], Uses = [ST0] in {
+def UCOM_Fr   : FPI<0xE0, AddRegFrm,    // FPSW = cmp ST(0) with ST(i)
+                    (outs), (ins RST:$reg),
+                    "fucom\t$reg">, DD;
+def UCOM_FPr  : FPI<0xE8, AddRegFrm,    // FPSW = cmp ST(0) with ST(i), pop
+                    (outs), (ins RST:$reg),
+                    "fucomp\t$reg">, DD;
+def UCOM_FPPr : FPI<0xE9, RawFrm,       // cmp ST(0) with ST(1), pop, pop
+                    (outs), (ins),
+                    "fucompp">, DA;
+
+def UCOM_FIr  : FPI<0xE8, AddRegFrm,    // CC = cmp ST(0) with ST(i)
+                    (outs), (ins RST:$reg),
+                    "fucomi\t$reg">, DB;
+def UCOM_FIPr : FPI<0xE8, AddRegFrm,    // CC = cmp ST(0) with ST(i), pop
+                    (outs), (ins RST:$reg),
+                    "fucompi\t$reg">, DF;
+}
+
+def COM_FIr  : FPI<0xF0, AddRegFrm, (outs), (ins RST:$reg),
+                   "fcomi\t$reg">, DB;
+def COM_FIPr : FPI<0xF0, AddRegFrm, (outs), (ins RST:$reg),
+                   "fcompi\t$reg">, DF;
+
+// Floating point flag ops.
+let Defs = [AX] in
+def FNSTSW8r : I<0xE0, RawFrm,                  // AX = fp flags
+                 (outs), (ins), "fnstsw %ax", []>, DF;
+
+def FNSTCW16m : I<0xD9, MRM7m,                  // [mem16] = X87 control word
+                  (outs), (ins i16mem:$dst), "fnstcw\t$dst",
+                  [(X86fp_cwd_get16 addr:$dst)]>;
+
+let mayLoad = 1 in
+def FLDCW16m : I<0xD9, MRM5m,                   // X87 control word = [mem16]
+                 (outs), (ins i16mem:$dst), "fldcw\t$dst", []>;
+
+// FPU control instructions
+def FNINIT : I<0xE3, RawFrm, (outs), (ins), "fninit", []>, DB;
+def FFREE : FPI<0xC0, AddRegFrm, (outs), (ins RST:$reg),
+                "ffree\t$reg">, DD;
+
+// Clear exceptions
+
+def FNCLEX : I<0xE2, RawFrm, (outs), (ins), "fnclex", []>, DB;
+
+// Operandless floating-point instructions for the disassembler.
+def WAIT : I<0x9B, RawFrm, (outs), (ins), "wait", []>; + +def FNOP : I<0xD0, RawFrm, (outs), (ins), "fnop", []>, D9; +def FXAM : I<0xE5, RawFrm, (outs), (ins), "fxam", []>, D9; +def FLDL2T : I<0xE9, RawFrm, (outs), (ins), "fldl2t", []>, D9; +def FLDL2E : I<0xEA, RawFrm, (outs), (ins), "fldl2e", []>, D9; +def FLDPI : I<0xEB, RawFrm, (outs), (ins), "fldpi", []>, D9; +def FLDLG2 : I<0xEC, RawFrm, (outs), (ins), "fldlg2", []>, D9; +def FLDLN2 : I<0xED, RawFrm, (outs), (ins), "fldln2", []>, D9; +def F2XM1 : I<0xF0, RawFrm, (outs), (ins), "f2xm1", []>, D9; +def FYL2X : I<0xF1, RawFrm, (outs), (ins), "fyl2x", []>, D9; +def FPTAN : I<0xF2, RawFrm, (outs), (ins), "fptan", []>, D9; +def FPATAN : I<0xF3, RawFrm, (outs), (ins), "fpatan", []>, D9; +def FXTRACT : I<0xF4, RawFrm, (outs), (ins), "fxtract", []>, D9; +def FPREM1 : I<0xF5, RawFrm, (outs), (ins), "fprem1", []>, D9; +def FDECSTP : I<0xF6, RawFrm, (outs), (ins), "fdecstp", []>, D9; +def FINCSTP : I<0xF7, RawFrm, (outs), (ins), "fincstp", []>, D9; +def FPREM : I<0xF8, RawFrm, (outs), (ins), "fprem", []>, D9; +def FYL2XP1 : I<0xF9, RawFrm, (outs), (ins), "fyl2xp1", []>, D9; +def FSINCOS : I<0xFB, RawFrm, (outs), (ins), "fsincos", []>, D9; +def FRNDINT : I<0xFC, RawFrm, (outs), (ins), "frndint", []>, D9; +def FSCALE : I<0xFD, RawFrm, (outs), (ins), "fscale", []>, D9; +def FCOMPP : I<0xD9, RawFrm, (outs), (ins), "fcompp", []>, DE; + +def FXSAVE : I<0xAE, MRM0m, (outs opaque512mem:$dst), (ins), + "fxsave\t$dst", []>, TB; +def FXSAVE64 : I<0xAE, MRM0m, (outs opaque512mem:$dst), (ins), + "fxsaveq\t$dst", []>, TB, REX_W, Requires<[In64BitMode]>; +def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src), + "fxrstor\t$src", []>, TB; +def FXRSTOR64 : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src), + "fxrstorq\t$src", []>, TB, REX_W, Requires<[In64BitMode]>; + +//===----------------------------------------------------------------------===// +// Non-Instruction Patterns +//===----------------------------------------------------------------------===// + +// Required for RET of f32 / f64 / f80 values. +def : Pat<(X86fld addr:$src, f32), (LD_Fp32m addr:$src)>; +def : Pat<(X86fld addr:$src, f64), (LD_Fp64m addr:$src)>; +def : Pat<(X86fld addr:$src, f80), (LD_Fp80m addr:$src)>; + +// Required for CALL which return f32 / f64 / f80 values. +def : Pat<(X86fst RFP32:$src, addr:$op, f32), (ST_Fp32m addr:$op, RFP32:$src)>; +def : Pat<(X86fst RFP64:$src, addr:$op, f32), (ST_Fp64m32 addr:$op, + RFP64:$src)>; +def : Pat<(X86fst RFP64:$src, addr:$op, f64), (ST_Fp64m addr:$op, RFP64:$src)>; +def : Pat<(X86fst RFP80:$src, addr:$op, f32), (ST_Fp80m32 addr:$op, + RFP80:$src)>; +def : Pat<(X86fst RFP80:$src, addr:$op, f64), (ST_Fp80m64 addr:$op, + RFP80:$src)>; +def : Pat<(X86fst RFP80:$src, addr:$op, f80), (ST_FpP80m addr:$op, + RFP80:$src)>; + +// Floating point constant -0.0 and -1.0 +def : Pat<(f32 fpimmneg0), (CHS_Fp32 (LD_Fp032))>, Requires<[FPStackf32]>; +def : Pat<(f32 fpimmneg1), (CHS_Fp32 (LD_Fp132))>, Requires<[FPStackf32]>; +def : Pat<(f64 fpimmneg0), (CHS_Fp64 (LD_Fp064))>, Requires<[FPStackf64]>; +def : Pat<(f64 fpimmneg1), (CHS_Fp64 (LD_Fp164))>, Requires<[FPStackf64]>; +def : Pat<(f80 fpimmneg0), (CHS_Fp80 (LD_Fp080))>; +def : Pat<(f80 fpimmneg1), (CHS_Fp80 (LD_Fp180))>; + +// Used to conv. i64 to f64 since there isn't a SSE version. +def : Pat<(X86fildflag addr:$src, i64), (ILD_Fp64m64 addr:$src)>; + +// FP extensions map onto simple pseudo-value conversions if they are to/from +// the FP stack. 
+def : Pat<(f64 (fextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP64)>, + Requires<[FPStackf32]>; +def : Pat<(f80 (fextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP80)>, + Requires<[FPStackf32]>; +def : Pat<(f80 (fextend RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP80)>, + Requires<[FPStackf64]>; + +// FP truncations map onto simple pseudo-value conversions if they are to/from +// the FP stack. We have validated that only value-preserving truncations make +// it through isel. +def : Pat<(f32 (fround RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP32)>, + Requires<[FPStackf32]>; +def : Pat<(f32 (fround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP32)>, + Requires<[FPStackf32]>; +def : Pat<(f64 (fround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP64)>, + Requires<[FPStackf64]>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrFormats.td b/contrib/llvm/lib/Target/X86/X86InstrFormats.td new file mode 100644 index 0000000..0a1590b --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrFormats.td @@ -0,0 +1,541 @@ +//===- X86InstrFormats.td - X86 Instruction Formats --------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// X86 Instruction Format Definitions. +// + +// Format specifies the encoding used by the instruction. This is part of the +// ad-hoc solution used to emit machine instruction encodings by our machine +// code emitter. +class Format<bits<6> val> { + bits<6> Value = val; +} + +def Pseudo : Format<0>; def RawFrm : Format<1>; +def AddRegFrm : Format<2>; def MRMDestReg : Format<3>; +def MRMDestMem : Format<4>; def MRMSrcReg : Format<5>; +def MRMSrcMem : Format<6>; +def MRM0r : Format<16>; def MRM1r : Format<17>; def MRM2r : Format<18>; +def MRM3r : Format<19>; def MRM4r : Format<20>; def MRM5r : Format<21>; +def MRM6r : Format<22>; def MRM7r : Format<23>; +def MRM0m : Format<24>; def MRM1m : Format<25>; def MRM2m : Format<26>; +def MRM3m : Format<27>; def MRM4m : Format<28>; def MRM5m : Format<29>; +def MRM6m : Format<30>; def MRM7m : Format<31>; +def MRMInitReg : Format<32>; +def MRM_C1 : Format<33>; +def MRM_C2 : Format<34>; +def MRM_C3 : Format<35>; +def MRM_C4 : Format<36>; +def MRM_C8 : Format<37>; +def MRM_C9 : Format<38>; +def MRM_E8 : Format<39>; +def MRM_F0 : Format<40>; +def MRM_F8 : Format<41>; +def MRM_F9 : Format<42>; +def RawFrmImm8 : Format<43>; +def RawFrmImm16 : Format<44>; +def MRM_D0 : Format<45>; +def MRM_D1 : Format<46>; + +// ImmType - This specifies the immediate type used by an instruction. This is +// part of the ad-hoc solution used to emit machine instruction encodings by our +// machine code emitter. +class ImmType<bits<3> val> { + bits<3> Value = val; +} +def NoImm : ImmType<0>; +def Imm8 : ImmType<1>; +def Imm8PCRel : ImmType<2>; +def Imm16 : ImmType<3>; +def Imm16PCRel : ImmType<4>; +def Imm32 : ImmType<5>; +def Imm32PCRel : ImmType<6>; +def Imm64 : ImmType<7>; + +// FPFormat - This specifies what form this FP instruction has. This is used by +// the Floating-Point stackifier pass. 
+class FPFormat<bits<3> val> { + bits<3> Value = val; +} +def NotFP : FPFormat<0>; +def ZeroArgFP : FPFormat<1>; +def OneArgFP : FPFormat<2>; +def OneArgFPRW : FPFormat<3>; +def TwoArgFP : FPFormat<4>; +def CompareFP : FPFormat<5>; +def CondMovFP : FPFormat<6>; +def SpecialFP : FPFormat<7>; + +// Class specifying the SSE execution domain, used by the SSEDomainFix pass. +// Keep in sync with tables in X86InstrInfo.cpp. +class Domain<bits<2> val> { + bits<2> Value = val; +} +def GenericDomain : Domain<0>; +def SSEPackedSingle : Domain<1>; +def SSEPackedDouble : Domain<2>; +def SSEPackedInt : Domain<3>; + +// Prefix byte classes which are used to indicate to the ad-hoc machine code +// emitter that various prefix bytes are required. +class OpSize { bit hasOpSizePrefix = 1; } +class AdSize { bit hasAdSizePrefix = 1; } +class REX_W { bit hasREX_WPrefix = 1; } +class LOCK { bit hasLockPrefix = 1; } +class SegFS { bits<2> SegOvrBits = 1; } +class SegGS { bits<2> SegOvrBits = 2; } +class TB { bits<5> Prefix = 1; } +class REP { bits<5> Prefix = 2; } +class D8 { bits<5> Prefix = 3; } +class D9 { bits<5> Prefix = 4; } +class DA { bits<5> Prefix = 5; } +class DB { bits<5> Prefix = 6; } +class DC { bits<5> Prefix = 7; } +class DD { bits<5> Prefix = 8; } +class DE { bits<5> Prefix = 9; } +class DF { bits<5> Prefix = 10; } +class XD { bits<5> Prefix = 11; } +class XS { bits<5> Prefix = 12; } +class T8 { bits<5> Prefix = 13; } +class TA { bits<5> Prefix = 14; } +class A6 { bits<5> Prefix = 15; } +class A7 { bits<5> Prefix = 16; } +class TF { bits<5> Prefix = 17; } +class VEX { bit hasVEXPrefix = 1; } +class VEX_W { bit hasVEX_WPrefix = 1; } +class VEX_4V : VEX { bit hasVEX_4VPrefix = 1; } +class VEX_I8IMM { bit hasVEX_i8ImmReg = 1; } +class VEX_L { bit hasVEX_L = 1; } +class VEX_LIG { bit ignoresVEX_L = 1; } +class Has3DNow0F0FOpcode { bit has3DNow0F0FOpcode = 1; } + +class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, + string AsmStr, Domain d = GenericDomain> + : Instruction { + let Namespace = "X86"; + + bits<8> Opcode = opcod; + Format Form = f; + bits<6> FormBits = Form.Value; + ImmType ImmT = i; + + dag OutOperandList = outs; + dag InOperandList = ins; + string AsmString = AsmStr; + + // If this is a pseudo instruction, mark it isCodeGenOnly. + let isCodeGenOnly = !eq(!cast<string>(f), "Pseudo"); + + // + // Attributes specific to X86 instructions... + // + bit hasOpSizePrefix = 0; // Does this inst have a 0x66 prefix? + bit hasAdSizePrefix = 0; // Does this inst have a 0x67 prefix? + + bits<5> Prefix = 0; // Which prefix byte does this inst have? + bit hasREX_WPrefix = 0; // Does this inst require the REX.W prefix? + FPFormat FPForm = NotFP; // What flavor of FP instruction is this? + bit hasLockPrefix = 0; // Does this inst have a 0xF0 prefix? + bits<2> SegOvrBits = 0; // Segment override prefix. + Domain ExeDomain = d; + bit hasVEXPrefix = 0; // Does this inst require a VEX prefix? + bit hasVEX_WPrefix = 0; // Does this inst set the VEX_W field? + bit hasVEX_4VPrefix = 0; // Does this inst require the VEX.VVVV field? + bit hasVEX_i8ImmReg = 0; // Does this inst require the last source register + // to be encoded in a immediate field? + bit hasVEX_L = 0; // Does this inst use large (256-bit) registers? + bit ignoresVEX_L = 0; // Does this instruction ignore the L-bit + bit has3DNow0F0FOpcode =0;// Wacky 3dNow! encoding? + + // TSFlags layout should be kept in sync with X86InstrInfo.h. 
+ let TSFlags{5-0} = FormBits; + let TSFlags{6} = hasOpSizePrefix; + let TSFlags{7} = hasAdSizePrefix; + let TSFlags{12-8} = Prefix; + let TSFlags{13} = hasREX_WPrefix; + let TSFlags{16-14} = ImmT.Value; + let TSFlags{19-17} = FPForm.Value; + let TSFlags{20} = hasLockPrefix; + let TSFlags{22-21} = SegOvrBits; + let TSFlags{24-23} = ExeDomain.Value; + let TSFlags{32-25} = Opcode; + let TSFlags{33} = hasVEXPrefix; + let TSFlags{34} = hasVEX_WPrefix; + let TSFlags{35} = hasVEX_4VPrefix; + let TSFlags{36} = hasVEX_i8ImmReg; + let TSFlags{37} = hasVEX_L; + let TSFlags{38} = ignoresVEX_L; + let TSFlags{39} = has3DNow0F0FOpcode; +} + +class PseudoI<dag oops, dag iops, list<dag> pattern> + : X86Inst<0, Pseudo, NoImm, oops, iops, ""> { + let Pattern = pattern; +} + +class I<bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern, Domain d = GenericDomain> + : X86Inst<o, f, NoImm, outs, ins, asm, d> { + let Pattern = pattern; + let CodeSize = 3; +} +class Ii8 <bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern, Domain d = GenericDomain> + : X86Inst<o, f, Imm8, outs, ins, asm, d> { + let Pattern = pattern; + let CodeSize = 3; +} +class Ii8PCRel<bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern> + : X86Inst<o, f, Imm8PCRel, outs, ins, asm> { + let Pattern = pattern; + let CodeSize = 3; +} +class Ii16<bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern> + : X86Inst<o, f, Imm16, outs, ins, asm> { + let Pattern = pattern; + let CodeSize = 3; +} +class Ii32<bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern> + : X86Inst<o, f, Imm32, outs, ins, asm> { + let Pattern = pattern; + let CodeSize = 3; +} + +class Ii16PCRel<bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern> + : X86Inst<o, f, Imm16PCRel, outs, ins, asm> { + let Pattern = pattern; + let CodeSize = 3; +} + +class Ii32PCRel<bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern> + : X86Inst<o, f, Imm32PCRel, outs, ins, asm> { + let Pattern = pattern; + let CodeSize = 3; +} + +// FPStack Instruction Templates: +// FPI - Floating Point Instruction template. +class FPI<bits<8> o, Format F, dag outs, dag ins, string asm> + : I<o, F, outs, ins, asm, []> {} + +// FpI_ - Floating Point Pseudo Instruction template. Not Predicated. 
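The let statements above pin each attribute to a fixed slice of the 64-bit TSFlags word, which is why the layout must stay in sync with X86InstrInfo.h. The sketch below shows the kind of bit extraction a consumer of TSFlags would perform; the accessor names are illustrative, not the real X86II helpers.

#include <cstdint>

// Hypothetical accessors mirroring the TSFlags layout declared above.
namespace tsflags_sketch {
inline unsigned form(uint64_t TSFlags)       { return  TSFlags        & 0x3f; } // bits 5-0:   Format
inline bool     hasOpSize(uint64_t TSFlags)  { return (TSFlags >> 6)  & 0x1;  } // bit  6:     0x66 prefix
inline bool     hasAdSize(uint64_t TSFlags)  { return (TSFlags >> 7)  & 0x1;  } // bit  7:     0x67 prefix
inline unsigned prefix(uint64_t TSFlags)     { return (TSFlags >> 8)  & 0x1f; } // bits 12-8:  prefix byte class
inline bool     hasREX_W(uint64_t TSFlags)   { return (TSFlags >> 13) & 0x1;  } // bit  13:    REX.W
inline unsigned immType(uint64_t TSFlags)    { return (TSFlags >> 14) & 0x7;  } // bits 16-14: immediate type
inline unsigned fpForm(uint64_t TSFlags)     { return (TSFlags >> 17) & 0x7;  } // bits 19-17: FP stackifier form
inline unsigned exeDomain(uint64_t TSFlags)  { return (TSFlags >> 23) & 0x3;  } // bits 24-23: SSE execution domain
inline uint8_t  baseOpcode(uint64_t TSFlags) { return (TSFlags >> 25) & 0xff; } // bits 32-25: base opcode byte
}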
+class FpI_<dag outs, dag ins, FPFormat fp, list<dag> pattern> + : X86Inst<0, Pseudo, NoImm, outs, ins, ""> { + let FPForm = fp; + let Pattern = pattern; +} + +// Templates for instructions that use a 16- or 32-bit segmented address as +// their only operand: lcall (FAR CALL) and ljmp (FAR JMP) +// +// Iseg16 - 16-bit segment selector, 16-bit offset +// Iseg32 - 16-bit segment selector, 32-bit offset + +class Iseg16 <bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern> : X86Inst<o, f, Imm16, outs, ins, asm> { + let Pattern = pattern; + let CodeSize = 3; +} + +class Iseg32 <bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern> : X86Inst<o, f, Imm32, outs, ins, asm> { + let Pattern = pattern; + let CodeSize = 3; +} + +// SI - SSE 1 & 2 scalar instructions +class SI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> + : I<o, F, outs, ins, asm, pattern> { + let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX], + !if(!eq(Prefix, 12 /* XS */), [HasSSE1], [HasSSE2])); + + // AVX instructions have a 'v' prefix in the mnemonic + let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm); +} + +// SIi8 - SSE 1 & 2 scalar instructions +class SIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : Ii8<o, F, outs, ins, asm, pattern> { + let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX], + !if(!eq(Prefix, 12 /* XS */), [HasSSE1], [HasSSE2])); + + // AVX instructions have a 'v' prefix in the mnemonic + let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm); +} + +// PI - SSE 1 & 2 packed instructions +class PI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, + Domain d> + : I<o, F, outs, ins, asm, pattern, d> { + let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX], + !if(hasOpSizePrefix /* OpSize */, [HasSSE2], [HasSSE1])); + + // AVX instructions have a 'v' prefix in the mnemonic + let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm); +} + +// PIi8 - SSE 1 & 2 packed instructions with immediate +class PIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, Domain d> + : Ii8<o, F, outs, ins, asm, pattern, d> { + let Predicates = !if(hasVEX_4VPrefix /* VEX */, [HasAVX], + !if(hasOpSizePrefix /* OpSize */, [HasSSE2], [HasSSE1])); + + // AVX instructions have a 'v' prefix in the mnemonic + let AsmString = !if(hasVEX_4VPrefix, !strconcat("v", asm), asm); +} + +// SSE1 Instruction Templates: +// +// SSI - SSE1 instructions with XS prefix. +// PSI - SSE1 instructions with TB prefix. +// PSIi8 - SSE1 instructions with ImmT == Imm8 and TB prefix. +// VSSI - SSE1 instructions with XS prefix in AVX form. +// VPSI - SSE1 instructions with TB prefix in AVX form. 
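The SI/SIi8 and PI/PIi8 templates just above derive their target predicates from the encoding bits instead of spelling them out per instruction: any VEX-encoded form is gated on AVX; for scalar ops the XS prefix (value 12 in the prefix classes earlier) selects SSE1 and everything else falls to SSE2; for packed ops the 0x66 operand-size prefix selects SSE2 and its absence SSE1. A hedged C++ restatement of that !if logic, with purely illustrative names:

enum class SSELevel { SSE1, SSE2, AVX };

// Scalar templates (SI/SIi8): VEX -> AVX, XS prefix -> SSE1, otherwise SSE2.
SSELevel scalarPredicate(bool HasVEXPrefix, unsigned Prefix) {
  if (HasVEXPrefix) return SSELevel::AVX;
  return Prefix == 12 /* XS */ ? SSELevel::SSE1 : SSELevel::SSE2;
}

// Packed templates (PI/PIi8): VEX -> AVX, 0x66 prefix -> SSE2, otherwise SSE1.
SSELevel packedPredicate(bool HasVEXPrefix, bool HasOpSizePrefix) {
  if (HasVEXPrefix) return SSELevel::AVX;
  return HasOpSizePrefix ? SSELevel::SSE2 : SSELevel::SSE1;
}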
+ +class SSI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> + : I<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE1]>; +class SSIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE1]>; +class PSI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> + : I<o, F, outs, ins, asm, pattern, SSEPackedSingle>, TB, + Requires<[HasSSE1]>; +class PSIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedSingle>, TB, + Requires<[HasSSE1]>; +class VSSI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, !strconcat("v", asm), pattern>, XS, + Requires<[HasAVX]>; +class VPSI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, !strconcat("v", asm), pattern, SSEPackedSingle>, TB, + Requires<[HasAVX]>; + +// SSE2 Instruction Templates: +// +// SDI - SSE2 instructions with XD prefix. +// SDIi8 - SSE2 instructions with ImmT == Imm8 and XD prefix. +// SSDIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix. +// PDI - SSE2 instructions with TB and OpSize prefixes. +// PDIi8 - SSE2 instructions with ImmT == Imm8 and TB and OpSize prefixes. +// VSDI - SSE2 instructions with XD prefix in AVX form. +// VPDI - SSE2 instructions with TB and OpSize prefixes in AVX form. + +class SDI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> + : I<o, F, outs, ins, asm, pattern>, XD, Requires<[HasSSE2]>; +class SDIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : Ii8<o, F, outs, ins, asm, pattern>, XD, Requires<[HasSSE2]>; +class SSDIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE2]>; +class PDI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> + : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, TB, OpSize, + Requires<[HasSSE2]>; +class PDIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedDouble>, TB, OpSize, + Requires<[HasSSE2]>; +class VSDI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, !strconcat("v", asm), pattern>, XD, + Requires<[HasAVX]>; +class VPDI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, !strconcat("v", asm), pattern, SSEPackedDouble>, TB, + OpSize, Requires<[HasAVX]>; + +// SSE3 Instruction Templates: +// +// S3I - SSE3 instructions with TB and OpSize prefixes. +// S3SI - SSE3 instructions with XS prefix. +// S3DI - SSE3 instructions with XD prefix. + +class S3SI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, asm, pattern, SSEPackedSingle>, XS, + Requires<[HasSSE3]>; +class S3DI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, XD, + Requires<[HasSSE3]>; +class S3I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> + : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, TB, OpSize, + Requires<[HasSSE3]>; + + +// SSSE3 Instruction Templates: +// +// SS38I - SSSE3 instructions with T8 prefix. +// SS3AI - SSSE3 instructions with TA prefix. +// +// Note: SSSE3 instructions have 64-bit and 128-bit versions. The 64-bit version +// uses the MMX registers. 
We put those instructions here because they better +// fit into the SSSE3 instruction category rather than the MMX category. + +class SS38I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8, + Requires<[HasSSSE3]>; +class SS3AI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, + Requires<[HasSSSE3]>; + +// SSE4.1 Instruction Templates: +// +// SS48I - SSE 4.1 instructions with T8 prefix. +// SS41AIi8 - SSE 4.1 instructions with TA prefix and ImmT == Imm8. +// +class SS48I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8, + Requires<[HasSSE41]>; +class SS4AIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, + Requires<[HasSSE41]>; + +// SSE4.2 Instruction Templates: +// +// SS428I - SSE 4.2 instructions with T8 prefix. +class SS428I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8, + Requires<[HasSSE42]>; + +// SS42FI - SSE 4.2 instructions with TF prefix. +class SS42FI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, asm, pattern>, TF, Requires<[HasSSE42]>; + +// SS42AI = SSE 4.2 instructions with TA prefix +class SS42AI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, + Requires<[HasSSE42]>; + +// AVX Instruction Templates: +// Instructions introduced in AVX (no SSE equivalent forms) +// +// AVX8I - AVX instructions with T8 and OpSize prefix. +// AVXAIi8 - AVX instructions with TA, OpSize prefix and ImmT = Imm8. +class AVX8I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8, OpSize, + Requires<[HasAVX]>; +class AVXAIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, OpSize, + Requires<[HasAVX]>; + +// AES Instruction Templates: +// +// AES8I +// These use the same encoding as the SSE4.2 T8 and TA encodings. +class AES8I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern> + : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8, + Requires<[HasAES]>; + +class AESAI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, + Requires<[HasAES]>; + +// CLMUL Instruction Templates +class CLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern> + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, + OpSize, Requires<[HasCLMUL]>; + +class AVXCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern> + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, + OpSize, VEX_4V, Requires<[HasAVX, HasCLMUL]>; + +// FMA3 Instruction Templates +class FMA3<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern> + : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8, + OpSize, VEX_4V, Requires<[HasFMA3]>; + +// X86-64 Instruction templates... 
+// + +class RI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> + : I<o, F, outs, ins, asm, pattern>, REX_W; +class RIi8 <bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : Ii8<o, F, outs, ins, asm, pattern>, REX_W; +class RIi32 <bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : Ii32<o, F, outs, ins, asm, pattern>, REX_W; + +class RIi64<bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern> + : X86Inst<o, f, Imm64, outs, ins, asm>, REX_W { + let Pattern = pattern; + let CodeSize = 3; +} + +class RSSI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : SSI<o, F, outs, ins, asm, pattern>, REX_W; +class RSDI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : SDI<o, F, outs, ins, asm, pattern>, REX_W; +class RPDI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : PDI<o, F, outs, ins, asm, pattern>, REX_W; +class VRPDI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : VPDI<o, F, outs, ins, asm, pattern>, VEX_W; + +// MMX Instruction templates +// + +// MMXI - MMX instructions with TB prefix. +// MMXI64 - MMX instructions with TB prefix valid only in 64 bit mode. +// MMX2I - MMX / SSE2 instructions with TB and OpSize prefixes. +// MMXIi8 - MMX instructions with ImmT == Imm8 and TB prefix. +// MMXIi8 - MMX instructions with ImmT == Imm8 and TB prefix. +// MMXID - MMX instructions with XD prefix. +// MMXIS - MMX instructions with XS prefix. +class MMXI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, asm, pattern>, TB, Requires<[HasMMX]>; +class MMXI64<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, asm, pattern>, TB, Requires<[HasMMX,In64BitMode]>; +class MMXRI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, asm, pattern>, TB, REX_W, Requires<[HasMMX]>; +class MMX2I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, asm, pattern>, TB, OpSize, Requires<[HasMMX]>; +class MMXIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : Ii8<o, F, outs, ins, asm, pattern>, TB, Requires<[HasMMX]>; +class MMXID<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : Ii8<o, F, outs, ins, asm, pattern>, XD, Requires<[HasMMX]>; +class MMXIS<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasMMX]>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td new file mode 100644 index 0000000..af919fb --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -0,0 +1,467 @@ +//======- X86InstrFragmentsSIMD.td - x86 ISA -------------*- tablegen -*-=====// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides pattern fragments useful for SIMD instructions. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// MMX Pattern Fragments +//===----------------------------------------------------------------------===// + +def load_mmx : PatFrag<(ops node:$ptr), (x86mmx (load node:$ptr))>; +def bc_mmx : PatFrag<(ops node:$in), (x86mmx (bitconvert node:$in))>; + +//===----------------------------------------------------------------------===// +// SSE specific DAG Nodes. +//===----------------------------------------------------------------------===// + +def SDTX86FPShiftOp : SDTypeProfile<1, 2, [ SDTCisSameAs<0, 1>, + SDTCisFP<0>, SDTCisInt<2> ]>; +def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>, + SDTCisFP<1>, SDTCisVT<3, i8>]>; + +def X86fmin : SDNode<"X86ISD::FMIN", SDTFPBinOp>; +def X86fmax : SDNode<"X86ISD::FMAX", SDTFPBinOp>; +def X86fand : SDNode<"X86ISD::FAND", SDTFPBinOp, + [SDNPCommutative, SDNPAssociative]>; +def X86for : SDNode<"X86ISD::FOR", SDTFPBinOp, + [SDNPCommutative, SDNPAssociative]>; +def X86fxor : SDNode<"X86ISD::FXOR", SDTFPBinOp, + [SDNPCommutative, SDNPAssociative]>; +def X86frsqrt : SDNode<"X86ISD::FRSQRT", SDTFPUnaryOp>; +def X86frcp : SDNode<"X86ISD::FRCP", SDTFPUnaryOp>; +def X86fsrl : SDNode<"X86ISD::FSRL", SDTX86FPShiftOp>; +def X86fgetsign: SDNode<"X86ISD::FGETSIGNx86",SDTFPToIntOp>; +def X86fhadd : SDNode<"X86ISD::FHADD", SDTFPBinOp>; +def X86fhsub : SDNode<"X86ISD::FHSUB", SDTFPBinOp>; +def X86comi : SDNode<"X86ISD::COMI", SDTX86CmpTest>; +def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>; +def X86cmpss : SDNode<"X86ISD::FSETCCss", SDTX86Cmpss>; +def X86cmpsd : SDNode<"X86ISD::FSETCCsd", SDTX86Cmpsd>; +def X86pshufb : SDNode<"X86ISD::PSHUFB", + SDTypeProfile<1, 2, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>>; +def X86andnp : SDNode<"X86ISD::ANDNP", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>>; +def X86psignb : SDNode<"X86ISD::PSIGNB", + SDTypeProfile<1, 2, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>>; +def X86psignw : SDNode<"X86ISD::PSIGNW", + SDTypeProfile<1, 2, [SDTCisVT<0, v8i16>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>>; +def X86psignd : SDNode<"X86ISD::PSIGND", + SDTypeProfile<1, 2, [SDTCisVT<0, v4i32>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>>; +def X86pextrb : SDNode<"X86ISD::PEXTRB", + SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>; +def X86pextrw : SDNode<"X86ISD::PEXTRW", + SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>; +def X86pinsrb : SDNode<"X86ISD::PINSRB", + SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>, + SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>; +def X86pinsrw : SDNode<"X86ISD::PINSRW", + SDTypeProfile<1, 3, [SDTCisVT<0, v8i16>, SDTCisSameAs<0,1>, + SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>; +def X86insrtps : SDNode<"X86ISD::INSERTPS", + SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisSameAs<0,1>, + SDTCisVT<2, v4f32>, SDTCisPtrTy<3>]>>; +def X86vzmovl : SDNode<"X86ISD::VZEXT_MOVL", + SDTypeProfile<1, 1, [SDTCisSameAs<0,1>]>>; +def X86vzload : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def X86vshl : SDNode<"X86ISD::VSHL", SDTIntShiftOp>; +def X86vshr : SDNode<"X86ISD::VSRL", SDTIntShiftOp>; +def X86cmpps : SDNode<"X86ISD::CMPPS", SDTX86VFCMP>; +def X86cmppd : SDNode<"X86ISD::CMPPD", SDTX86VFCMP>; +def X86pcmpeqb : SDNode<"X86ISD::PCMPEQB", SDTIntBinOp, [SDNPCommutative]>; +def X86pcmpeqw : SDNode<"X86ISD::PCMPEQW", 
SDTIntBinOp, [SDNPCommutative]>; +def X86pcmpeqd : SDNode<"X86ISD::PCMPEQD", SDTIntBinOp, [SDNPCommutative]>; +def X86pcmpeqq : SDNode<"X86ISD::PCMPEQQ", SDTIntBinOp, [SDNPCommutative]>; +def X86pcmpgtb : SDNode<"X86ISD::PCMPGTB", SDTIntBinOp>; +def X86pcmpgtw : SDNode<"X86ISD::PCMPGTW", SDTIntBinOp>; +def X86pcmpgtd : SDNode<"X86ISD::PCMPGTD", SDTIntBinOp>; +def X86pcmpgtq : SDNode<"X86ISD::PCMPGTQ", SDTIntBinOp>; + +def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, + SDTCisVec<1>, + SDTCisSameAs<2, 1>]>; +def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>; +def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>; + +// Specific shuffle nodes - At some point ISD::VECTOR_SHUFFLE will always get +// translated into one of the target nodes below during lowering. +// Note: this is a work in progress... +def SDTShuff1Op : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>; +def SDTShuff2Op : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>; + +def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>, + SDTCisSameAs<0,1>, SDTCisInt<2>]>; +def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, SDTCisInt<3>]>; + +def SDTVBroadcast : SDTypeProfile<1, 1, [SDTCisVec<0>]>; + +def X86PAlign : SDNode<"X86ISD::PALIGN", SDTShuff3OpI>; + +def X86PShufd : SDNode<"X86ISD::PSHUFD", SDTShuff2OpI>; +def X86PShufhw : SDNode<"X86ISD::PSHUFHW", SDTShuff2OpI>; +def X86PShuflw : SDNode<"X86ISD::PSHUFLW", SDTShuff2OpI>; + +def X86Shufpd : SDNode<"X86ISD::SHUFPD", SDTShuff3OpI>; +def X86Shufps : SDNode<"X86ISD::SHUFPS", SDTShuff3OpI>; + +def X86Movddup : SDNode<"X86ISD::MOVDDUP", SDTShuff1Op>; +def X86Movshdup : SDNode<"X86ISD::MOVSHDUP", SDTShuff1Op>; +def X86Movsldup : SDNode<"X86ISD::MOVSLDUP", SDTShuff1Op>; + +def X86Movsd : SDNode<"X86ISD::MOVSD", SDTShuff2Op>; +def X86Movss : SDNode<"X86ISD::MOVSS", SDTShuff2Op>; + +def X86Movlhps : SDNode<"X86ISD::MOVLHPS", SDTShuff2Op>; +def X86Movlhpd : SDNode<"X86ISD::MOVLHPD", SDTShuff2Op>; +def X86Movhlps : SDNode<"X86ISD::MOVHLPS", SDTShuff2Op>; +def X86Movhlpd : SDNode<"X86ISD::MOVHLPD", SDTShuff2Op>; + +def X86Movlps : SDNode<"X86ISD::MOVLPS", SDTShuff2Op>; +def X86Movlpd : SDNode<"X86ISD::MOVLPD", SDTShuff2Op>; + +def X86Unpcklps : SDNode<"X86ISD::UNPCKLPS", SDTShuff2Op>; +def X86Unpcklpd : SDNode<"X86ISD::UNPCKLPD", SDTShuff2Op>; +def X86Unpcklpsy : SDNode<"X86ISD::VUNPCKLPSY", SDTShuff2Op>; +def X86Unpcklpdy : SDNode<"X86ISD::VUNPCKLPDY", SDTShuff2Op>; + +def X86Unpckhps : SDNode<"X86ISD::UNPCKHPS", SDTShuff2Op>; +def X86Unpckhpd : SDNode<"X86ISD::UNPCKHPD", SDTShuff2Op>; +def X86Unpckhpsy : SDNode<"X86ISD::VUNPCKHPSY", SDTShuff2Op>; +def X86Unpckhpdy : SDNode<"X86ISD::VUNPCKHPDY", SDTShuff2Op>; + +def X86Punpcklbw : SDNode<"X86ISD::PUNPCKLBW", SDTShuff2Op>; +def X86Punpcklwd : SDNode<"X86ISD::PUNPCKLWD", SDTShuff2Op>; +def X86Punpckldq : SDNode<"X86ISD::PUNPCKLDQ", SDTShuff2Op>; +def X86Punpcklqdq : SDNode<"X86ISD::PUNPCKLQDQ", SDTShuff2Op>; + +def X86Punpckhbw : SDNode<"X86ISD::PUNPCKHBW", SDTShuff2Op>; +def X86Punpckhwd : SDNode<"X86ISD::PUNPCKHWD", SDTShuff2Op>; +def X86Punpckhdq : SDNode<"X86ISD::PUNPCKHDQ", SDTShuff2Op>; +def X86Punpckhqdq : SDNode<"X86ISD::PUNPCKHQDQ", SDTShuff2Op>; + +def X86VPermilps : SDNode<"X86ISD::VPERMILPS", SDTShuff2OpI>; +def X86VPermilpsy : SDNode<"X86ISD::VPERMILPSY", SDTShuff2OpI>; +def X86VPermilpd : SDNode<"X86ISD::VPERMILPD", SDTShuff2OpI>; +def X86VPermilpdy : SDNode<"X86ISD::VPERMILPDY", SDTShuff2OpI>; + +def X86VPerm2f128 : 
SDNode<"X86ISD::VPERM2F128", SDTShuff3OpI>; + +def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>; + +//===----------------------------------------------------------------------===// +// SSE Complex Patterns +//===----------------------------------------------------------------------===// + +// These are 'extloads' from a scalar to the low element of a vector, zeroing +// the top elements. These are used for the SSE 'ss' and 'sd' instruction +// forms. +def sse_load_f32 : ComplexPattern<v4f32, 5, "SelectScalarSSELoad", [], + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand, + SDNPWantRoot]>; +def sse_load_f64 : ComplexPattern<v2f64, 5, "SelectScalarSSELoad", [], + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand, + SDNPWantRoot]>; + +def ssmem : Operand<v4f32> { + let PrintMethod = "printf32mem"; + let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm); + let ParserMatchClass = X86MemAsmOperand; + let OperandType = "OPERAND_MEMORY"; +} +def sdmem : Operand<v2f64> { + let PrintMethod = "printf64mem"; + let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm); + let ParserMatchClass = X86MemAsmOperand; + let OperandType = "OPERAND_MEMORY"; +} + +//===----------------------------------------------------------------------===// +// SSE pattern fragments +//===----------------------------------------------------------------------===// + +// 128-bit load pattern fragments +def loadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (load node:$ptr))>; +def loadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (load node:$ptr))>; +def loadv4i32 : PatFrag<(ops node:$ptr), (v4i32 (load node:$ptr))>; +def loadv2i64 : PatFrag<(ops node:$ptr), (v2i64 (load node:$ptr))>; + +// 256-bit load pattern fragments +def loadv8f32 : PatFrag<(ops node:$ptr), (v8f32 (load node:$ptr))>; +def loadv4f64 : PatFrag<(ops node:$ptr), (v4f64 (load node:$ptr))>; +def loadv8i32 : PatFrag<(ops node:$ptr), (v8i32 (load node:$ptr))>; +def loadv4i64 : PatFrag<(ops node:$ptr), (v4i64 (load node:$ptr))>; + +// Like 'store', but always requires 128-bit vector alignment. +def alignedstore : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return cast<StoreSDNode>(N)->getAlignment() >= 16; +}]>; + +// Like 'store', but always requires 256-bit vector alignment. +def alignedstore256 : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return cast<StoreSDNode>(N)->getAlignment() >= 32; +}]>; + +// Like 'load', but always requires 128-bit vector alignment. +def alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return cast<LoadSDNode>(N)->getAlignment() >= 16; +}]>; + +// Like 'load', but always requires 256-bit vector alignment. 
+def alignedload256 : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return cast<LoadSDNode>(N)->getAlignment() >= 32; +}]>; + +def alignedloadfsf32 : PatFrag<(ops node:$ptr), + (f32 (alignedload node:$ptr))>; +def alignedloadfsf64 : PatFrag<(ops node:$ptr), + (f64 (alignedload node:$ptr))>; + +// 128-bit aligned load pattern fragments +def alignedloadv4f32 : PatFrag<(ops node:$ptr), + (v4f32 (alignedload node:$ptr))>; +def alignedloadv2f64 : PatFrag<(ops node:$ptr), + (v2f64 (alignedload node:$ptr))>; +def alignedloadv4i32 : PatFrag<(ops node:$ptr), + (v4i32 (alignedload node:$ptr))>; +def alignedloadv2i64 : PatFrag<(ops node:$ptr), + (v2i64 (alignedload node:$ptr))>; + +// 256-bit aligned load pattern fragments +def alignedloadv8f32 : PatFrag<(ops node:$ptr), + (v8f32 (alignedload256 node:$ptr))>; +def alignedloadv4f64 : PatFrag<(ops node:$ptr), + (v4f64 (alignedload256 node:$ptr))>; +def alignedloadv8i32 : PatFrag<(ops node:$ptr), + (v8i32 (alignedload256 node:$ptr))>; +def alignedloadv4i64 : PatFrag<(ops node:$ptr), + (v4i64 (alignedload256 node:$ptr))>; + +// Like 'load', but uses special alignment checks suitable for use in +// memory operands in most SSE instructions, which are required to +// be naturally aligned on some targets but not on others. If the subtarget +// allows unaligned accesses, match any load, though this may require +// setting a feature bit in the processor (on startup, for example). +// Opteron 10h and later implement such a feature. +def memop : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return Subtarget->hasVectorUAMem() + || cast<LoadSDNode>(N)->getAlignment() >= 16; +}]>; + +def memopfsf32 : PatFrag<(ops node:$ptr), (f32 (memop node:$ptr))>; +def memopfsf64 : PatFrag<(ops node:$ptr), (f64 (memop node:$ptr))>; + +// 128-bit memop pattern fragments +def memopv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop node:$ptr))>; +def memopv2f64 : PatFrag<(ops node:$ptr), (v2f64 (memop node:$ptr))>; +def memopv4i32 : PatFrag<(ops node:$ptr), (v4i32 (memop node:$ptr))>; +def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>; +def memopv8i16 : PatFrag<(ops node:$ptr), (v8i16 (memop node:$ptr))>; +def memopv16i8 : PatFrag<(ops node:$ptr), (v16i8 (memop node:$ptr))>; + +// 256-bit memop pattern fragments +def memopv32i8 : PatFrag<(ops node:$ptr), (v32i8 (memop node:$ptr))>; +def memopv8f32 : PatFrag<(ops node:$ptr), (v8f32 (memop node:$ptr))>; +def memopv4f64 : PatFrag<(ops node:$ptr), (v4f64 (memop node:$ptr))>; +def memopv4i64 : PatFrag<(ops node:$ptr), (v4i64 (memop node:$ptr))>; +def memopv8i32 : PatFrag<(ops node:$ptr), (v8i32 (memop node:$ptr))>; + +// SSSE3 uses MMX registers for some instructions. They aren't aligned on a +// 16-byte boundary. 
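The memop fragment above is what allows instruction selection to fold a vector load directly into an SSE instruction's memory operand: the fold is legal when the load is at least 16-byte aligned, or when the subtarget reports that unaligned vector memory operands are acceptable (hasVectorUAMem). A minimal sketch of that decision, under hypothetical names:

// Illustrative equivalent of the 'memop' predicate above; not LLVM API.
bool canFoldVectorLoad(bool SubtargetHasVectorUAMem, unsigned LoadAlignment) {
  return SubtargetHasVectorUAMem || LoadAlignment >= 16;
}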
+// FIXME: 8 byte alignment for mmx reads is not required +def memop64 : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return cast<LoadSDNode>(N)->getAlignment() >= 8; +}]>; + +def memopmmx : PatFrag<(ops node:$ptr), (x86mmx (memop64 node:$ptr))>; + +// MOVNT Support +// Like 'store', but requires the non-temporal bit to be set +def nontemporalstore : PatFrag<(ops node:$val, node:$ptr), + (st node:$val, node:$ptr), [{ + if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) + return ST->isNonTemporal(); + return false; +}]>; + +def alignednontemporalstore : PatFrag<(ops node:$val, node:$ptr), + (st node:$val, node:$ptr), [{ + if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) + return ST->isNonTemporal() && !ST->isTruncatingStore() && + ST->getAddressingMode() == ISD::UNINDEXED && + ST->getAlignment() >= 16; + return false; +}]>; + +def unalignednontemporalstore : PatFrag<(ops node:$val, node:$ptr), + (st node:$val, node:$ptr), [{ + if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) + return ST->isNonTemporal() && + ST->getAlignment() < 16; + return false; +}]>; + +// 128-bit bitconvert pattern fragments +def bc_v4f32 : PatFrag<(ops node:$in), (v4f32 (bitconvert node:$in))>; +def bc_v2f64 : PatFrag<(ops node:$in), (v2f64 (bitconvert node:$in))>; +def bc_v16i8 : PatFrag<(ops node:$in), (v16i8 (bitconvert node:$in))>; +def bc_v8i16 : PatFrag<(ops node:$in), (v8i16 (bitconvert node:$in))>; +def bc_v4i32 : PatFrag<(ops node:$in), (v4i32 (bitconvert node:$in))>; +def bc_v2i64 : PatFrag<(ops node:$in), (v2i64 (bitconvert node:$in))>; + +// 256-bit bitconvert pattern fragments +def bc_v8i32 : PatFrag<(ops node:$in), (v8i32 (bitconvert node:$in))>; +def bc_v4i64 : PatFrag<(ops node:$in), (v4i64 (bitconvert node:$in))>; + +def vzmovl_v2i64 : PatFrag<(ops node:$src), + (bitconvert (v2i64 (X86vzmovl + (v2i64 (scalar_to_vector (loadi64 node:$src))))))>; +def vzmovl_v4i32 : PatFrag<(ops node:$src), + (bitconvert (v4i32 (X86vzmovl + (v4i32 (scalar_to_vector (loadi32 node:$src))))))>; + +def vzload_v2i64 : PatFrag<(ops node:$src), + (bitconvert (v2i64 (X86vzload node:$src)))>; + + +def fp32imm0 : PatLeaf<(f32 fpimm), [{ + return N->isExactlyValue(+0.0); +}]>; + +// BYTE_imm - Transform bit immediates into byte immediates. +def BYTE_imm : SDNodeXForm<imm, [{ + // Transformation function: imm >> 3 + return getI32Imm(N->getZExtValue() >> 3); +}]>; + +// SHUFFLE_get_shuf_imm xform function: convert vector_shuffle mask to PSHUF*, +// SHUFP* etc. imm. +def SHUFFLE_get_shuf_imm : SDNodeXForm<vector_shuffle, [{ + return getI8Imm(X86::getShuffleSHUFImmediate(N)); +}]>; + +// SHUFFLE_get_pshufhw_imm xform function: convert vector_shuffle mask to +// PSHUFHW imm. +def SHUFFLE_get_pshufhw_imm : SDNodeXForm<vector_shuffle, [{ + return getI8Imm(X86::getShufflePSHUFHWImmediate(N)); +}]>; + +// SHUFFLE_get_pshuflw_imm xform function: convert vector_shuffle mask to +// PSHUFLW imm. +def SHUFFLE_get_pshuflw_imm : SDNodeXForm<vector_shuffle, [{ + return getI8Imm(X86::getShufflePSHUFLWImmediate(N)); +}]>; + +// SHUFFLE_get_palign_imm xform function: convert vector_shuffle mask to +// a PALIGNR imm. +def SHUFFLE_get_palign_imm : SDNodeXForm<vector_shuffle, [{ + return getI8Imm(X86::getShufflePALIGNRImmediate(N)); +}]>; + +// EXTRACT_get_vextractf128_imm xform function: convert extract_subvector index +// to VEXTRACTF128 imm. 
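The BYTE_imm transform above converts a bit count carried in the DAG into the byte count that byte-granular immediates expect, i.e. a plain right shift by three; for instance a 40-bit amount becomes the byte immediate 40 >> 3 = 5. A one-line illustrative equivalent:

// Illustrative equivalent of the BYTE_imm transform above: bits -> bytes.
constexpr unsigned byteImmFromBitImm(unsigned BitImm) { return BitImm >> 3; }
static_assert(byteImmFromBitImm(40) == 5, "40 bits is five bytes");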
+def EXTRACT_get_vextractf128_imm : SDNodeXForm<extract_subvector, [{ + return getI8Imm(X86::getExtractVEXTRACTF128Immediate(N)); +}]>; + +// INSERT_get_vinsertf128_imm xform function: convert insert_subvector index to +// VINSERTF128 imm. +def INSERT_get_vinsertf128_imm : SDNodeXForm<insert_subvector, [{ + return getI8Imm(X86::getInsertVINSERTF128Immediate(N)); +}]>; + +def splat_lo : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); + return SVOp->isSplat() && SVOp->getSplatIndex() == 0; +}]>; + +def movddup : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return X86::isMOVDDUPMask(cast<ShuffleVectorSDNode>(N)); +}]>; + +def movhlps : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return X86::isMOVHLPSMask(cast<ShuffleVectorSDNode>(N)); +}]>; + +def movhlps_undef : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return X86::isMOVHLPS_v_undef_Mask(cast<ShuffleVectorSDNode>(N)); +}]>; + +def movlhps : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return X86::isMOVLHPSMask(cast<ShuffleVectorSDNode>(N)); +}]>; + +def movlp : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return X86::isMOVLPMask(cast<ShuffleVectorSDNode>(N)); +}]>; + +def movl : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return X86::isMOVLMask(cast<ShuffleVectorSDNode>(N)); +}]>; + +def unpckl : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return X86::isUNPCKLMask(cast<ShuffleVectorSDNode>(N)); +}]>; + +def unpckh : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return X86::isUNPCKHMask(cast<ShuffleVectorSDNode>(N)); +}]>; + +def pshufd : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return X86::isPSHUFDMask(cast<ShuffleVectorSDNode>(N)); +}], SHUFFLE_get_shuf_imm>; + +def shufp : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return X86::isSHUFPMask(cast<ShuffleVectorSDNode>(N)); +}], SHUFFLE_get_shuf_imm>; + +def pshufhw : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return X86::isPSHUFHWMask(cast<ShuffleVectorSDNode>(N)); +}], SHUFFLE_get_pshufhw_imm>; + +def pshuflw : PatFrag<(ops node:$lhs, node:$rhs), + (vector_shuffle node:$lhs, node:$rhs), [{ + return X86::isPSHUFLWMask(cast<ShuffleVectorSDNode>(N)); +}], SHUFFLE_get_pshuflw_imm>; + +def vextractf128_extract : PatFrag<(ops node:$bigvec, node:$index), + (extract_subvector node:$bigvec, + node:$index), [{ + return X86::isVEXTRACTF128Index(N); +}], EXTRACT_get_vextractf128_imm>; + +def vinsertf128_insert : PatFrag<(ops node:$bigvec, node:$smallvec, + node:$index), + (insert_subvector node:$bigvec, node:$smallvec, + node:$index), [{ + return X86::isVINSERTF128Index(N); +}], INSERT_get_vinsertf128_imm>; + diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp new file mode 100644 index 0000000..3a02de0 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -0,0 +1,3532 @@ +//===- X86InstrInfo.cpp - X86 Instruction Information -----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file contains the X86 implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#include "X86InstrInfo.h" +#include "X86.h" +#include "X86InstrBuilder.h" +#include "X86MachineFunctionInfo.h" +#include "X86Subtarget.h" +#include "X86TargetMachine.h" +#include "llvm/DerivedTypes.h" +#include "llvm/LLVMContext.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/MC/MCInst.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/MC/MCAsmInfo.h" +#include <limits> + +#define GET_INSTRINFO_CTOR +#include "X86GenInstrInfo.inc" + +using namespace llvm; + +static cl::opt<bool> +NoFusing("disable-spill-fusing", + cl::desc("Disable fusing of spill code into instructions")); +static cl::opt<bool> +PrintFailedFusing("print-failed-fuse-candidates", + cl::desc("Print instructions that the allocator wants to" + " fuse, but the X86 backend currently can't"), + cl::Hidden); +static cl::opt<bool> +ReMatPICStubLoad("remat-pic-stub-load", + cl::desc("Re-materialize load from stub in PIC mode"), + cl::init(false), cl::Hidden); + +enum { + // Select which memory operand is being unfolded. + // (stored in bits 0 - 7) + TB_INDEX_0 = 0, + TB_INDEX_1 = 1, + TB_INDEX_2 = 2, + TB_INDEX_MASK = 0xff, + + // Minimum alignment required for load/store. + // Used for RegOp->MemOp conversion. + // (stored in bits 8 - 15) + TB_ALIGN_SHIFT = 8, + TB_ALIGN_NONE = 0 << TB_ALIGN_SHIFT, + TB_ALIGN_16 = 16 << TB_ALIGN_SHIFT, + TB_ALIGN_32 = 32 << TB_ALIGN_SHIFT, + TB_ALIGN_MASK = 0xff << TB_ALIGN_SHIFT, + + // Do not insert the reverse map (MemOp -> RegOp) into the table. + // This may be needed because there is a many -> one mapping. + TB_NO_REVERSE = 1 << 16, + + // Do not insert the forward map (RegOp -> MemOp) into the table. + // This is needed for Native Client, which prohibits branch + // instructions from using a memory operand. + TB_NO_FORWARD = 1 << 17, + + TB_FOLDED_LOAD = 1 << 18, + TB_FOLDED_STORE = 1 << 19 +}; + +X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) + : X86GenInstrInfo((tm.getSubtarget<X86Subtarget>().is64Bit() + ? X86::ADJCALLSTACKDOWN64 + : X86::ADJCALLSTACKDOWN32), + (tm.getSubtarget<X86Subtarget>().is64Bit() + ? 
X86::ADJCALLSTACKUP64 + : X86::ADJCALLSTACKUP32)), + TM(tm), RI(tm, *this) { + + static const unsigned OpTbl2Addr[][3] = { + { X86::ADC32ri, X86::ADC32mi, 0 }, + { X86::ADC32ri8, X86::ADC32mi8, 0 }, + { X86::ADC32rr, X86::ADC32mr, 0 }, + { X86::ADC64ri32, X86::ADC64mi32, 0 }, + { X86::ADC64ri8, X86::ADC64mi8, 0 }, + { X86::ADC64rr, X86::ADC64mr, 0 }, + { X86::ADD16ri, X86::ADD16mi, 0 }, + { X86::ADD16ri8, X86::ADD16mi8, 0 }, + { X86::ADD16ri_DB, X86::ADD16mi, TB_NO_REVERSE }, + { X86::ADD16ri8_DB, X86::ADD16mi8, TB_NO_REVERSE }, + { X86::ADD16rr, X86::ADD16mr, 0 }, + { X86::ADD16rr_DB, X86::ADD16mr, TB_NO_REVERSE }, + { X86::ADD32ri, X86::ADD32mi, 0 }, + { X86::ADD32ri8, X86::ADD32mi8, 0 }, + { X86::ADD32ri_DB, X86::ADD32mi, TB_NO_REVERSE }, + { X86::ADD32ri8_DB, X86::ADD32mi8, TB_NO_REVERSE }, + { X86::ADD32rr, X86::ADD32mr, 0 }, + { X86::ADD32rr_DB, X86::ADD32mr, TB_NO_REVERSE }, + { X86::ADD64ri32, X86::ADD64mi32, 0 }, + { X86::ADD64ri8, X86::ADD64mi8, 0 }, + { X86::ADD64ri32_DB,X86::ADD64mi32, TB_NO_REVERSE }, + { X86::ADD64ri8_DB, X86::ADD64mi8, TB_NO_REVERSE }, + { X86::ADD64rr, X86::ADD64mr, 0 }, + { X86::ADD64rr_DB, X86::ADD64mr, TB_NO_REVERSE }, + { X86::ADD8ri, X86::ADD8mi, 0 }, + { X86::ADD8rr, X86::ADD8mr, 0 }, + { X86::AND16ri, X86::AND16mi, 0 }, + { X86::AND16ri8, X86::AND16mi8, 0 }, + { X86::AND16rr, X86::AND16mr, 0 }, + { X86::AND32ri, X86::AND32mi, 0 }, + { X86::AND32ri8, X86::AND32mi8, 0 }, + { X86::AND32rr, X86::AND32mr, 0 }, + { X86::AND64ri32, X86::AND64mi32, 0 }, + { X86::AND64ri8, X86::AND64mi8, 0 }, + { X86::AND64rr, X86::AND64mr, 0 }, + { X86::AND8ri, X86::AND8mi, 0 }, + { X86::AND8rr, X86::AND8mr, 0 }, + { X86::DEC16r, X86::DEC16m, 0 }, + { X86::DEC32r, X86::DEC32m, 0 }, + { X86::DEC64_16r, X86::DEC64_16m, 0 }, + { X86::DEC64_32r, X86::DEC64_32m, 0 }, + { X86::DEC64r, X86::DEC64m, 0 }, + { X86::DEC8r, X86::DEC8m, 0 }, + { X86::INC16r, X86::INC16m, 0 }, + { X86::INC32r, X86::INC32m, 0 }, + { X86::INC64_16r, X86::INC64_16m, 0 }, + { X86::INC64_32r, X86::INC64_32m, 0 }, + { X86::INC64r, X86::INC64m, 0 }, + { X86::INC8r, X86::INC8m, 0 }, + { X86::NEG16r, X86::NEG16m, 0 }, + { X86::NEG32r, X86::NEG32m, 0 }, + { X86::NEG64r, X86::NEG64m, 0 }, + { X86::NEG8r, X86::NEG8m, 0 }, + { X86::NOT16r, X86::NOT16m, 0 }, + { X86::NOT32r, X86::NOT32m, 0 }, + { X86::NOT64r, X86::NOT64m, 0 }, + { X86::NOT8r, X86::NOT8m, 0 }, + { X86::OR16ri, X86::OR16mi, 0 }, + { X86::OR16ri8, X86::OR16mi8, 0 }, + { X86::OR16rr, X86::OR16mr, 0 }, + { X86::OR32ri, X86::OR32mi, 0 }, + { X86::OR32ri8, X86::OR32mi8, 0 }, + { X86::OR32rr, X86::OR32mr, 0 }, + { X86::OR64ri32, X86::OR64mi32, 0 }, + { X86::OR64ri8, X86::OR64mi8, 0 }, + { X86::OR64rr, X86::OR64mr, 0 }, + { X86::OR8ri, X86::OR8mi, 0 }, + { X86::OR8rr, X86::OR8mr, 0 }, + { X86::ROL16r1, X86::ROL16m1, 0 }, + { X86::ROL16rCL, X86::ROL16mCL, 0 }, + { X86::ROL16ri, X86::ROL16mi, 0 }, + { X86::ROL32r1, X86::ROL32m1, 0 }, + { X86::ROL32rCL, X86::ROL32mCL, 0 }, + { X86::ROL32ri, X86::ROL32mi, 0 }, + { X86::ROL64r1, X86::ROL64m1, 0 }, + { X86::ROL64rCL, X86::ROL64mCL, 0 }, + { X86::ROL64ri, X86::ROL64mi, 0 }, + { X86::ROL8r1, X86::ROL8m1, 0 }, + { X86::ROL8rCL, X86::ROL8mCL, 0 }, + { X86::ROL8ri, X86::ROL8mi, 0 }, + { X86::ROR16r1, X86::ROR16m1, 0 }, + { X86::ROR16rCL, X86::ROR16mCL, 0 }, + { X86::ROR16ri, X86::ROR16mi, 0 }, + { X86::ROR32r1, X86::ROR32m1, 0 }, + { X86::ROR32rCL, X86::ROR32mCL, 0 }, + { X86::ROR32ri, X86::ROR32mi, 0 }, + { X86::ROR64r1, X86::ROR64m1, 0 }, + { X86::ROR64rCL, X86::ROR64mCL, 0 }, + { X86::ROR64ri, X86::ROR64mi, 0 }, + { 
X86::ROR8r1, X86::ROR8m1, 0 }, + { X86::ROR8rCL, X86::ROR8mCL, 0 }, + { X86::ROR8ri, X86::ROR8mi, 0 }, + { X86::SAR16r1, X86::SAR16m1, 0 }, + { X86::SAR16rCL, X86::SAR16mCL, 0 }, + { X86::SAR16ri, X86::SAR16mi, 0 }, + { X86::SAR32r1, X86::SAR32m1, 0 }, + { X86::SAR32rCL, X86::SAR32mCL, 0 }, + { X86::SAR32ri, X86::SAR32mi, 0 }, + { X86::SAR64r1, X86::SAR64m1, 0 }, + { X86::SAR64rCL, X86::SAR64mCL, 0 }, + { X86::SAR64ri, X86::SAR64mi, 0 }, + { X86::SAR8r1, X86::SAR8m1, 0 }, + { X86::SAR8rCL, X86::SAR8mCL, 0 }, + { X86::SAR8ri, X86::SAR8mi, 0 }, + { X86::SBB32ri, X86::SBB32mi, 0 }, + { X86::SBB32ri8, X86::SBB32mi8, 0 }, + { X86::SBB32rr, X86::SBB32mr, 0 }, + { X86::SBB64ri32, X86::SBB64mi32, 0 }, + { X86::SBB64ri8, X86::SBB64mi8, 0 }, + { X86::SBB64rr, X86::SBB64mr, 0 }, + { X86::SHL16rCL, X86::SHL16mCL, 0 }, + { X86::SHL16ri, X86::SHL16mi, 0 }, + { X86::SHL32rCL, X86::SHL32mCL, 0 }, + { X86::SHL32ri, X86::SHL32mi, 0 }, + { X86::SHL64rCL, X86::SHL64mCL, 0 }, + { X86::SHL64ri, X86::SHL64mi, 0 }, + { X86::SHL8rCL, X86::SHL8mCL, 0 }, + { X86::SHL8ri, X86::SHL8mi, 0 }, + { X86::SHLD16rrCL, X86::SHLD16mrCL, 0 }, + { X86::SHLD16rri8, X86::SHLD16mri8, 0 }, + { X86::SHLD32rrCL, X86::SHLD32mrCL, 0 }, + { X86::SHLD32rri8, X86::SHLD32mri8, 0 }, + { X86::SHLD64rrCL, X86::SHLD64mrCL, 0 }, + { X86::SHLD64rri8, X86::SHLD64mri8, 0 }, + { X86::SHR16r1, X86::SHR16m1, 0 }, + { X86::SHR16rCL, X86::SHR16mCL, 0 }, + { X86::SHR16ri, X86::SHR16mi, 0 }, + { X86::SHR32r1, X86::SHR32m1, 0 }, + { X86::SHR32rCL, X86::SHR32mCL, 0 }, + { X86::SHR32ri, X86::SHR32mi, 0 }, + { X86::SHR64r1, X86::SHR64m1, 0 }, + { X86::SHR64rCL, X86::SHR64mCL, 0 }, + { X86::SHR64ri, X86::SHR64mi, 0 }, + { X86::SHR8r1, X86::SHR8m1, 0 }, + { X86::SHR8rCL, X86::SHR8mCL, 0 }, + { X86::SHR8ri, X86::SHR8mi, 0 }, + { X86::SHRD16rrCL, X86::SHRD16mrCL, 0 }, + { X86::SHRD16rri8, X86::SHRD16mri8, 0 }, + { X86::SHRD32rrCL, X86::SHRD32mrCL, 0 }, + { X86::SHRD32rri8, X86::SHRD32mri8, 0 }, + { X86::SHRD64rrCL, X86::SHRD64mrCL, 0 }, + { X86::SHRD64rri8, X86::SHRD64mri8, 0 }, + { X86::SUB16ri, X86::SUB16mi, 0 }, + { X86::SUB16ri8, X86::SUB16mi8, 0 }, + { X86::SUB16rr, X86::SUB16mr, 0 }, + { X86::SUB32ri, X86::SUB32mi, 0 }, + { X86::SUB32ri8, X86::SUB32mi8, 0 }, + { X86::SUB32rr, X86::SUB32mr, 0 }, + { X86::SUB64ri32, X86::SUB64mi32, 0 }, + { X86::SUB64ri8, X86::SUB64mi8, 0 }, + { X86::SUB64rr, X86::SUB64mr, 0 }, + { X86::SUB8ri, X86::SUB8mi, 0 }, + { X86::SUB8rr, X86::SUB8mr, 0 }, + { X86::XOR16ri, X86::XOR16mi, 0 }, + { X86::XOR16ri8, X86::XOR16mi8, 0 }, + { X86::XOR16rr, X86::XOR16mr, 0 }, + { X86::XOR32ri, X86::XOR32mi, 0 }, + { X86::XOR32ri8, X86::XOR32mi8, 0 }, + { X86::XOR32rr, X86::XOR32mr, 0 }, + { X86::XOR64ri32, X86::XOR64mi32, 0 }, + { X86::XOR64ri8, X86::XOR64mi8, 0 }, + { X86::XOR64rr, X86::XOR64mr, 0 }, + { X86::XOR8ri, X86::XOR8mi, 0 }, + { X86::XOR8rr, X86::XOR8mr, 0 } + }; + + for (unsigned i = 0, e = array_lengthof(OpTbl2Addr); i != e; ++i) { + unsigned RegOp = OpTbl2Addr[i][0]; + unsigned MemOp = OpTbl2Addr[i][1]; + unsigned Flags = OpTbl2Addr[i][2]; + AddTableEntry(RegOp2MemOpTable2Addr, MemOp2RegOpTable, + RegOp, MemOp, + // Index 0, folded load and store, no alignment requirement. 
+ Flags | TB_INDEX_0 | TB_FOLDED_LOAD | TB_FOLDED_STORE); + } + + static const unsigned OpTbl0[][3] = { + { X86::BT16ri8, X86::BT16mi8, TB_FOLDED_LOAD }, + { X86::BT32ri8, X86::BT32mi8, TB_FOLDED_LOAD }, + { X86::BT64ri8, X86::BT64mi8, TB_FOLDED_LOAD }, + { X86::CALL32r, X86::CALL32m, TB_FOLDED_LOAD }, + { X86::CALL64r, X86::CALL64m, TB_FOLDED_LOAD }, + { X86::WINCALL64r, X86::WINCALL64m, TB_FOLDED_LOAD }, + { X86::CMP16ri, X86::CMP16mi, TB_FOLDED_LOAD }, + { X86::CMP16ri8, X86::CMP16mi8, TB_FOLDED_LOAD }, + { X86::CMP16rr, X86::CMP16mr, TB_FOLDED_LOAD }, + { X86::CMP32ri, X86::CMP32mi, TB_FOLDED_LOAD }, + { X86::CMP32ri8, X86::CMP32mi8, TB_FOLDED_LOAD }, + { X86::CMP32rr, X86::CMP32mr, TB_FOLDED_LOAD }, + { X86::CMP64ri32, X86::CMP64mi32, TB_FOLDED_LOAD }, + { X86::CMP64ri8, X86::CMP64mi8, TB_FOLDED_LOAD }, + { X86::CMP64rr, X86::CMP64mr, TB_FOLDED_LOAD }, + { X86::CMP8ri, X86::CMP8mi, TB_FOLDED_LOAD }, + { X86::CMP8rr, X86::CMP8mr, TB_FOLDED_LOAD }, + { X86::DIV16r, X86::DIV16m, TB_FOLDED_LOAD }, + { X86::DIV32r, X86::DIV32m, TB_FOLDED_LOAD }, + { X86::DIV64r, X86::DIV64m, TB_FOLDED_LOAD }, + { X86::DIV8r, X86::DIV8m, TB_FOLDED_LOAD }, + { X86::EXTRACTPSrr, X86::EXTRACTPSmr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::FsMOVAPDrr, X86::MOVSDmr, TB_FOLDED_STORE | TB_NO_REVERSE }, + { X86::FsMOVAPSrr, X86::MOVSSmr, TB_FOLDED_STORE | TB_NO_REVERSE }, + { X86::IDIV16r, X86::IDIV16m, TB_FOLDED_LOAD }, + { X86::IDIV32r, X86::IDIV32m, TB_FOLDED_LOAD }, + { X86::IDIV64r, X86::IDIV64m, TB_FOLDED_LOAD }, + { X86::IDIV8r, X86::IDIV8m, TB_FOLDED_LOAD }, + { X86::IMUL16r, X86::IMUL16m, TB_FOLDED_LOAD }, + { X86::IMUL32r, X86::IMUL32m, TB_FOLDED_LOAD }, + { X86::IMUL64r, X86::IMUL64m, TB_FOLDED_LOAD }, + { X86::IMUL8r, X86::IMUL8m, TB_FOLDED_LOAD }, + { X86::JMP32r, X86::JMP32m, TB_FOLDED_LOAD }, + { X86::JMP64r, X86::JMP64m, TB_FOLDED_LOAD }, + { X86::MOV16ri, X86::MOV16mi, TB_FOLDED_STORE }, + { X86::MOV16rr, X86::MOV16mr, TB_FOLDED_STORE }, + { X86::MOV32ri, X86::MOV32mi, TB_FOLDED_STORE }, + { X86::MOV32rr, X86::MOV32mr, TB_FOLDED_STORE }, + { X86::MOV64ri32, X86::MOV64mi32, TB_FOLDED_STORE }, + { X86::MOV64rr, X86::MOV64mr, TB_FOLDED_STORE }, + { X86::MOV8ri, X86::MOV8mi, TB_FOLDED_STORE }, + { X86::MOV8rr, X86::MOV8mr, TB_FOLDED_STORE }, + { X86::MOV8rr_NOREX, X86::MOV8mr_NOREX, TB_FOLDED_STORE }, + { X86::MOVAPDrr, X86::MOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::MOVAPSrr, X86::MOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::MOVDQArr, X86::MOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::MOVPDI2DIrr, X86::MOVPDI2DImr, TB_FOLDED_STORE }, + { X86::MOVPQIto64rr,X86::MOVPQI2QImr, TB_FOLDED_STORE }, + { X86::MOVSDto64rr, X86::MOVSDto64mr, TB_FOLDED_STORE }, + { X86::MOVSS2DIrr, X86::MOVSS2DImr, TB_FOLDED_STORE }, + { X86::MOVUPDrr, X86::MOVUPDmr, TB_FOLDED_STORE }, + { X86::MOVUPSrr, X86::MOVUPSmr, TB_FOLDED_STORE }, + { X86::MUL16r, X86::MUL16m, TB_FOLDED_LOAD }, + { X86::MUL32r, X86::MUL32m, TB_FOLDED_LOAD }, + { X86::MUL64r, X86::MUL64m, TB_FOLDED_LOAD }, + { X86::MUL8r, X86::MUL8m, TB_FOLDED_LOAD }, + { X86::SETAEr, X86::SETAEm, TB_FOLDED_STORE }, + { X86::SETAr, X86::SETAm, TB_FOLDED_STORE }, + { X86::SETBEr, X86::SETBEm, TB_FOLDED_STORE }, + { X86::SETBr, X86::SETBm, TB_FOLDED_STORE }, + { X86::SETEr, X86::SETEm, TB_FOLDED_STORE }, + { X86::SETGEr, X86::SETGEm, TB_FOLDED_STORE }, + { X86::SETGr, X86::SETGm, TB_FOLDED_STORE }, + { X86::SETLEr, X86::SETLEm, TB_FOLDED_STORE }, + { X86::SETLr, X86::SETLm, TB_FOLDED_STORE }, + { X86::SETNEr, X86::SETNEm, TB_FOLDED_STORE }, + { 
X86::SETNOr, X86::SETNOm, TB_FOLDED_STORE }, + { X86::SETNPr, X86::SETNPm, TB_FOLDED_STORE }, + { X86::SETNSr, X86::SETNSm, TB_FOLDED_STORE }, + { X86::SETOr, X86::SETOm, TB_FOLDED_STORE }, + { X86::SETPr, X86::SETPm, TB_FOLDED_STORE }, + { X86::SETSr, X86::SETSm, TB_FOLDED_STORE }, + { X86::TAILJMPr, X86::TAILJMPm, TB_FOLDED_LOAD }, + { X86::TAILJMPr64, X86::TAILJMPm64, TB_FOLDED_LOAD }, + { X86::TEST16ri, X86::TEST16mi, TB_FOLDED_LOAD }, + { X86::TEST32ri, X86::TEST32mi, TB_FOLDED_LOAD }, + { X86::TEST64ri32, X86::TEST64mi32, TB_FOLDED_LOAD }, + { X86::TEST8ri, X86::TEST8mi, TB_FOLDED_LOAD }, + // AVX 128-bit versions of foldable instructions + { X86::VEXTRACTPSrr,X86::VEXTRACTPSmr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::FsVMOVAPDrr, X86::VMOVSDmr, TB_FOLDED_STORE | TB_NO_REVERSE }, + { X86::FsVMOVAPSrr, X86::VMOVSSmr, TB_FOLDED_STORE | TB_NO_REVERSE }, + { X86::VMOVAPDrr, X86::VMOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::VMOVAPSrr, X86::VMOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::VMOVDQArr, X86::VMOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::VMOVPDI2DIrr,X86::VMOVPDI2DImr, TB_FOLDED_STORE }, + { X86::VMOVPQIto64rr, X86::VMOVPQI2QImr,TB_FOLDED_STORE }, + { X86::VMOVSDto64rr,X86::VMOVSDto64mr, TB_FOLDED_STORE }, + { X86::VMOVSS2DIrr, X86::VMOVSS2DImr, TB_FOLDED_STORE }, + { X86::VMOVUPDrr, X86::VMOVUPDmr, TB_FOLDED_STORE }, + { X86::VMOVUPSrr, X86::VMOVUPSmr, TB_FOLDED_STORE }, + // AVX 256-bit foldable instructions + { X86::VMOVAPDYrr, X86::VMOVAPDYmr, TB_FOLDED_STORE | TB_ALIGN_32 }, + { X86::VMOVAPSYrr, X86::VMOVAPSYmr, TB_FOLDED_STORE | TB_ALIGN_32 }, + { X86::VMOVDQAYrr, X86::VMOVDQAYmr, TB_FOLDED_STORE | TB_ALIGN_32 }, + { X86::VMOVUPDYrr, X86::VMOVUPDYmr, TB_FOLDED_STORE }, + { X86::VMOVUPSYrr, X86::VMOVUPSYmr, TB_FOLDED_STORE } + }; + + for (unsigned i = 0, e = array_lengthof(OpTbl0); i != e; ++i) { + unsigned RegOp = OpTbl0[i][0]; + unsigned MemOp = OpTbl0[i][1]; + unsigned Flags = OpTbl0[i][2]; + AddTableEntry(RegOp2MemOpTable0, MemOp2RegOpTable, + RegOp, MemOp, TB_INDEX_0 | Flags); + } + + static const unsigned OpTbl1[][3] = { + { X86::CMP16rr, X86::CMP16rm, 0 }, + { X86::CMP32rr, X86::CMP32rm, 0 }, + { X86::CMP64rr, X86::CMP64rm, 0 }, + { X86::CMP8rr, X86::CMP8rm, 0 }, + { X86::CVTSD2SSrr, X86::CVTSD2SSrm, 0 }, + { X86::CVTSI2SD64rr, X86::CVTSI2SD64rm, 0 }, + { X86::CVTSI2SDrr, X86::CVTSI2SDrm, 0 }, + { X86::CVTSI2SS64rr, X86::CVTSI2SS64rm, 0 }, + { X86::CVTSI2SSrr, X86::CVTSI2SSrm, 0 }, + { X86::CVTSS2SDrr, X86::CVTSS2SDrm, 0 }, + { X86::CVTTSD2SI64rr, X86::CVTTSD2SI64rm, 0 }, + { X86::CVTTSD2SIrr, X86::CVTTSD2SIrm, 0 }, + { X86::CVTTSS2SI64rr, X86::CVTTSS2SI64rm, 0 }, + { X86::CVTTSS2SIrr, X86::CVTTSS2SIrm, 0 }, + { X86::FsMOVAPDrr, X86::MOVSDrm, TB_NO_REVERSE }, + { X86::FsMOVAPSrr, X86::MOVSSrm, TB_NO_REVERSE }, + { X86::IMUL16rri, X86::IMUL16rmi, 0 }, + { X86::IMUL16rri8, X86::IMUL16rmi8, 0 }, + { X86::IMUL32rri, X86::IMUL32rmi, 0 }, + { X86::IMUL32rri8, X86::IMUL32rmi8, 0 }, + { X86::IMUL64rri32, X86::IMUL64rmi32, 0 }, + { X86::IMUL64rri8, X86::IMUL64rmi8, 0 }, + { X86::Int_COMISDrr, X86::Int_COMISDrm, 0 }, + { X86::Int_COMISSrr, X86::Int_COMISSrm, 0 }, + { X86::Int_CVTDQ2PDrr, X86::Int_CVTDQ2PDrm, TB_ALIGN_16 }, + { X86::Int_CVTDQ2PSrr, X86::Int_CVTDQ2PSrm, TB_ALIGN_16 }, + { X86::Int_CVTPD2DQrr, X86::Int_CVTPD2DQrm, TB_ALIGN_16 }, + { X86::Int_CVTPD2PSrr, X86::Int_CVTPD2PSrm, TB_ALIGN_16 }, + { X86::Int_CVTPS2DQrr, X86::Int_CVTPS2DQrm, TB_ALIGN_16 }, + { X86::Int_CVTPS2PDrr, X86::Int_CVTPS2PDrm, 0 }, + { X86::CVTSD2SI64rr, X86::CVTSD2SI64rm, 0 
}, + { X86::CVTSD2SIrr, X86::CVTSD2SIrm, 0 }, + { X86::Int_CVTSD2SSrr, X86::Int_CVTSD2SSrm, 0 }, + { X86::Int_CVTSI2SD64rr,X86::Int_CVTSI2SD64rm, 0 }, + { X86::Int_CVTSI2SDrr, X86::Int_CVTSI2SDrm, 0 }, + { X86::Int_CVTSI2SS64rr,X86::Int_CVTSI2SS64rm, 0 }, + { X86::Int_CVTSI2SSrr, X86::Int_CVTSI2SSrm, 0 }, + { X86::Int_CVTSS2SDrr, X86::Int_CVTSS2SDrm, 0 }, + { X86::CVTTPD2DQrr, X86::CVTTPD2DQrm, TB_ALIGN_16 }, + { X86::CVTTPS2DQrr, X86::CVTTPS2DQrm, TB_ALIGN_16 }, + { X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm, 0 }, + { X86::Int_CVTTSD2SIrr, X86::Int_CVTTSD2SIrm, 0 }, + { X86::Int_CVTTSS2SI64rr,X86::Int_CVTTSS2SI64rm, 0 }, + { X86::Int_CVTTSS2SIrr, X86::Int_CVTTSS2SIrm, 0 }, + { X86::Int_UCOMISDrr, X86::Int_UCOMISDrm, 0 }, + { X86::Int_UCOMISSrr, X86::Int_UCOMISSrm, 0 }, + { X86::MOV16rr, X86::MOV16rm, 0 }, + { X86::MOV32rr, X86::MOV32rm, 0 }, + { X86::MOV64rr, X86::MOV64rm, 0 }, + { X86::MOV64toPQIrr, X86::MOVQI2PQIrm, 0 }, + { X86::MOV64toSDrr, X86::MOV64toSDrm, 0 }, + { X86::MOV8rr, X86::MOV8rm, 0 }, + { X86::MOVAPDrr, X86::MOVAPDrm, TB_ALIGN_16 }, + { X86::MOVAPSrr, X86::MOVAPSrm, TB_ALIGN_16 }, + { X86::MOVDDUPrr, X86::MOVDDUPrm, 0 }, + { X86::MOVDI2PDIrr, X86::MOVDI2PDIrm, 0 }, + { X86::MOVDI2SSrr, X86::MOVDI2SSrm, 0 }, + { X86::MOVDQArr, X86::MOVDQArm, TB_ALIGN_16 }, + { X86::MOVSHDUPrr, X86::MOVSHDUPrm, TB_ALIGN_16 }, + { X86::MOVSLDUPrr, X86::MOVSLDUPrm, TB_ALIGN_16 }, + { X86::MOVSX16rr8, X86::MOVSX16rm8, 0 }, + { X86::MOVSX32rr16, X86::MOVSX32rm16, 0 }, + { X86::MOVSX32rr8, X86::MOVSX32rm8, 0 }, + { X86::MOVSX64rr16, X86::MOVSX64rm16, 0 }, + { X86::MOVSX64rr32, X86::MOVSX64rm32, 0 }, + { X86::MOVSX64rr8, X86::MOVSX64rm8, 0 }, + { X86::MOVUPDrr, X86::MOVUPDrm, TB_ALIGN_16 }, + { X86::MOVUPSrr, X86::MOVUPSrm, 0 }, + { X86::MOVZDI2PDIrr, X86::MOVZDI2PDIrm, 0 }, + { X86::MOVZQI2PQIrr, X86::MOVZQI2PQIrm, 0 }, + { X86::MOVZPQILo2PQIrr, X86::MOVZPQILo2PQIrm, TB_ALIGN_16 }, + { X86::MOVZX16rr8, X86::MOVZX16rm8, 0 }, + { X86::MOVZX32rr16, X86::MOVZX32rm16, 0 }, + { X86::MOVZX32_NOREXrr8, X86::MOVZX32_NOREXrm8, 0 }, + { X86::MOVZX32rr8, X86::MOVZX32rm8, 0 }, + { X86::MOVZX64rr16, X86::MOVZX64rm16, 0 }, + { X86::MOVZX64rr32, X86::MOVZX64rm32, 0 }, + { X86::MOVZX64rr8, X86::MOVZX64rm8, 0 }, + { X86::PSHUFDri, X86::PSHUFDmi, TB_ALIGN_16 }, + { X86::PSHUFHWri, X86::PSHUFHWmi, TB_ALIGN_16 }, + { X86::PSHUFLWri, X86::PSHUFLWmi, TB_ALIGN_16 }, + { X86::RCPPSr, X86::RCPPSm, TB_ALIGN_16 }, + { X86::RCPPSr_Int, X86::RCPPSm_Int, TB_ALIGN_16 }, + { X86::RSQRTPSr, X86::RSQRTPSm, TB_ALIGN_16 }, + { X86::RSQRTPSr_Int, X86::RSQRTPSm_Int, TB_ALIGN_16 }, + { X86::RSQRTSSr, X86::RSQRTSSm, 0 }, + { X86::RSQRTSSr_Int, X86::RSQRTSSm_Int, 0 }, + { X86::SQRTPDr, X86::SQRTPDm, TB_ALIGN_16 }, + { X86::SQRTPDr_Int, X86::SQRTPDm_Int, TB_ALIGN_16 }, + { X86::SQRTPSr, X86::SQRTPSm, TB_ALIGN_16 }, + { X86::SQRTPSr_Int, X86::SQRTPSm_Int, TB_ALIGN_16 }, + { X86::SQRTSDr, X86::SQRTSDm, 0 }, + { X86::SQRTSDr_Int, X86::SQRTSDm_Int, 0 }, + { X86::SQRTSSr, X86::SQRTSSm, 0 }, + { X86::SQRTSSr_Int, X86::SQRTSSm_Int, 0 }, + { X86::TEST16rr, X86::TEST16rm, 0 }, + { X86::TEST32rr, X86::TEST32rm, 0 }, + { X86::TEST64rr, X86::TEST64rm, 0 }, + { X86::TEST8rr, X86::TEST8rm, 0 }, + // FIXME: TEST*rr EAX,EAX ---> CMP [mem], 0 + { X86::UCOMISDrr, X86::UCOMISDrm, 0 }, + { X86::UCOMISSrr, X86::UCOMISSrm, 0 }, + // AVX 128-bit versions of foldable instructions + { X86::Int_VCOMISDrr, X86::Int_VCOMISDrm, 0 }, + { X86::Int_VCOMISSrr, X86::Int_VCOMISSrm, 0 }, + { X86::Int_VCVTDQ2PDrr, X86::Int_VCVTDQ2PDrm, TB_ALIGN_16 }, + { 
X86::Int_VCVTDQ2PSrr, X86::Int_VCVTDQ2PSrm, TB_ALIGN_16 }, + { X86::Int_VCVTPD2DQrr, X86::Int_VCVTPD2DQrm, TB_ALIGN_16 }, + { X86::Int_VCVTPD2PSrr, X86::Int_VCVTPD2PSrm, TB_ALIGN_16 }, + { X86::Int_VCVTPS2DQrr, X86::Int_VCVTPS2DQrm, TB_ALIGN_16 }, + { X86::Int_VCVTPS2PDrr, X86::Int_VCVTPS2PDrm, 0 }, + { X86::Int_VUCOMISDrr, X86::Int_VUCOMISDrm, 0 }, + { X86::Int_VUCOMISSrr, X86::Int_VUCOMISSrm, 0 }, + { X86::FsVMOVAPDrr, X86::VMOVSDrm, TB_NO_REVERSE }, + { X86::FsVMOVAPSrr, X86::VMOVSSrm, TB_NO_REVERSE }, + { X86::VMOV64toPQIrr, X86::VMOVQI2PQIrm, 0 }, + { X86::VMOV64toSDrr, X86::VMOV64toSDrm, 0 }, + { X86::VMOVAPDrr, X86::VMOVAPDrm, TB_ALIGN_16 }, + { X86::VMOVAPSrr, X86::VMOVAPSrm, TB_ALIGN_16 }, + { X86::VMOVDDUPrr, X86::VMOVDDUPrm, 0 }, + { X86::VMOVDI2PDIrr, X86::VMOVDI2PDIrm, 0 }, + { X86::VMOVDI2SSrr, X86::VMOVDI2SSrm, 0 }, + { X86::VMOVDQArr, X86::VMOVDQArm, TB_ALIGN_16 }, + { X86::VMOVSLDUPrr, X86::VMOVSLDUPrm, TB_ALIGN_16 }, + { X86::VMOVSHDUPrr, X86::VMOVSHDUPrm, TB_ALIGN_16 }, + { X86::VMOVUPDrr, X86::VMOVUPDrm, TB_ALIGN_16 }, + { X86::VMOVUPSrr, X86::VMOVUPSrm, 0 }, + { X86::VMOVZDI2PDIrr, X86::VMOVZDI2PDIrm, 0 }, + { X86::VMOVZQI2PQIrr, X86::VMOVZQI2PQIrm, 0 }, + { X86::VMOVZPQILo2PQIrr,X86::VMOVZPQILo2PQIrm, TB_ALIGN_16 }, + { X86::VPSHUFDri, X86::VPSHUFDmi, TB_ALIGN_16 }, + { X86::VPSHUFHWri, X86::VPSHUFHWmi, TB_ALIGN_16 }, + { X86::VPSHUFLWri, X86::VPSHUFLWmi, TB_ALIGN_16 }, + { X86::VRCPPSr, X86::VRCPPSm, TB_ALIGN_16 }, + { X86::VRCPPSr_Int, X86::VRCPPSm_Int, TB_ALIGN_16 }, + { X86::VRSQRTPSr, X86::VRSQRTPSm, TB_ALIGN_16 }, + { X86::VRSQRTPSr_Int, X86::VRSQRTPSm_Int, TB_ALIGN_16 }, + { X86::VSQRTPDr, X86::VSQRTPDm, TB_ALIGN_16 }, + { X86::VSQRTPDr_Int, X86::VSQRTPDm_Int, TB_ALIGN_16 }, + { X86::VSQRTPSr, X86::VSQRTPSm, TB_ALIGN_16 }, + { X86::VSQRTPSr_Int, X86::VSQRTPSm_Int, TB_ALIGN_16 }, + { X86::VUCOMISDrr, X86::VUCOMISDrm, 0 }, + { X86::VUCOMISSrr, X86::VUCOMISSrm, 0 }, + // AVX 256-bit foldable instructions + { X86::VMOVAPDYrr, X86::VMOVAPDYrm, TB_ALIGN_32 }, + { X86::VMOVAPSYrr, X86::VMOVAPSYrm, TB_ALIGN_32 }, + { X86::VMOVDQAYrr, X86::VMOVDQAYrm, TB_ALIGN_16 }, + { X86::VMOVUPDYrr, X86::VMOVUPDYrm, 0 }, + { X86::VMOVUPSYrr, X86::VMOVUPSYrm, 0 } + }; + + for (unsigned i = 0, e = array_lengthof(OpTbl1); i != e; ++i) { + unsigned RegOp = OpTbl1[i][0]; + unsigned MemOp = OpTbl1[i][1]; + unsigned Flags = OpTbl1[i][2]; + AddTableEntry(RegOp2MemOpTable1, MemOp2RegOpTable, + RegOp, MemOp, + // Index 1, folded load + Flags | TB_INDEX_1 | TB_FOLDED_LOAD); + } + + static const unsigned OpTbl2[][3] = { + { X86::ADC32rr, X86::ADC32rm, 0 }, + { X86::ADC64rr, X86::ADC64rm, 0 }, + { X86::ADD16rr, X86::ADD16rm, 0 }, + { X86::ADD16rr_DB, X86::ADD16rm, TB_NO_REVERSE }, + { X86::ADD32rr, X86::ADD32rm, 0 }, + { X86::ADD32rr_DB, X86::ADD32rm, TB_NO_REVERSE }, + { X86::ADD64rr, X86::ADD64rm, 0 }, + { X86::ADD64rr_DB, X86::ADD64rm, TB_NO_REVERSE }, + { X86::ADD8rr, X86::ADD8rm, 0 }, + { X86::ADDPDrr, X86::ADDPDrm, TB_ALIGN_16 }, + { X86::ADDPSrr, X86::ADDPSrm, TB_ALIGN_16 }, + { X86::ADDSDrr, X86::ADDSDrm, 0 }, + { X86::ADDSSrr, X86::ADDSSrm, 0 }, + { X86::ADDSUBPDrr, X86::ADDSUBPDrm, TB_ALIGN_16 }, + { X86::ADDSUBPSrr, X86::ADDSUBPSrm, TB_ALIGN_16 }, + { X86::AND16rr, X86::AND16rm, 0 }, + { X86::AND32rr, X86::AND32rm, 0 }, + { X86::AND64rr, X86::AND64rm, 0 }, + { X86::AND8rr, X86::AND8rm, 0 }, + { X86::ANDNPDrr, X86::ANDNPDrm, TB_ALIGN_16 }, + { X86::ANDNPSrr, X86::ANDNPSrm, TB_ALIGN_16 }, + { X86::ANDPDrr, X86::ANDPDrm, TB_ALIGN_16 }, + { X86::ANDPSrr, X86::ANDPSrm, TB_ALIGN_16 }, + { 
X86::CMOVA16rr, X86::CMOVA16rm, 0 }, + { X86::CMOVA32rr, X86::CMOVA32rm, 0 }, + { X86::CMOVA64rr, X86::CMOVA64rm, 0 }, + { X86::CMOVAE16rr, X86::CMOVAE16rm, 0 }, + { X86::CMOVAE32rr, X86::CMOVAE32rm, 0 }, + { X86::CMOVAE64rr, X86::CMOVAE64rm, 0 }, + { X86::CMOVB16rr, X86::CMOVB16rm, 0 }, + { X86::CMOVB32rr, X86::CMOVB32rm, 0 }, + { X86::CMOVB64rr, X86::CMOVB64rm, 0 }, + { X86::CMOVBE16rr, X86::CMOVBE16rm, 0 }, + { X86::CMOVBE32rr, X86::CMOVBE32rm, 0 }, + { X86::CMOVBE64rr, X86::CMOVBE64rm, 0 }, + { X86::CMOVE16rr, X86::CMOVE16rm, 0 }, + { X86::CMOVE32rr, X86::CMOVE32rm, 0 }, + { X86::CMOVE64rr, X86::CMOVE64rm, 0 }, + { X86::CMOVG16rr, X86::CMOVG16rm, 0 }, + { X86::CMOVG32rr, X86::CMOVG32rm, 0 }, + { X86::CMOVG64rr, X86::CMOVG64rm, 0 }, + { X86::CMOVGE16rr, X86::CMOVGE16rm, 0 }, + { X86::CMOVGE32rr, X86::CMOVGE32rm, 0 }, + { X86::CMOVGE64rr, X86::CMOVGE64rm, 0 }, + { X86::CMOVL16rr, X86::CMOVL16rm, 0 }, + { X86::CMOVL32rr, X86::CMOVL32rm, 0 }, + { X86::CMOVL64rr, X86::CMOVL64rm, 0 }, + { X86::CMOVLE16rr, X86::CMOVLE16rm, 0 }, + { X86::CMOVLE32rr, X86::CMOVLE32rm, 0 }, + { X86::CMOVLE64rr, X86::CMOVLE64rm, 0 }, + { X86::CMOVNE16rr, X86::CMOVNE16rm, 0 }, + { X86::CMOVNE32rr, X86::CMOVNE32rm, 0 }, + { X86::CMOVNE64rr, X86::CMOVNE64rm, 0 }, + { X86::CMOVNO16rr, X86::CMOVNO16rm, 0 }, + { X86::CMOVNO32rr, X86::CMOVNO32rm, 0 }, + { X86::CMOVNO64rr, X86::CMOVNO64rm, 0 }, + { X86::CMOVNP16rr, X86::CMOVNP16rm, 0 }, + { X86::CMOVNP32rr, X86::CMOVNP32rm, 0 }, + { X86::CMOVNP64rr, X86::CMOVNP64rm, 0 }, + { X86::CMOVNS16rr, X86::CMOVNS16rm, 0 }, + { X86::CMOVNS32rr, X86::CMOVNS32rm, 0 }, + { X86::CMOVNS64rr, X86::CMOVNS64rm, 0 }, + { X86::CMOVO16rr, X86::CMOVO16rm, 0 }, + { X86::CMOVO32rr, X86::CMOVO32rm, 0 }, + { X86::CMOVO64rr, X86::CMOVO64rm, 0 }, + { X86::CMOVP16rr, X86::CMOVP16rm, 0 }, + { X86::CMOVP32rr, X86::CMOVP32rm, 0 }, + { X86::CMOVP64rr, X86::CMOVP64rm, 0 }, + { X86::CMOVS16rr, X86::CMOVS16rm, 0 }, + { X86::CMOVS32rr, X86::CMOVS32rm, 0 }, + { X86::CMOVS64rr, X86::CMOVS64rm, 0 }, + { X86::CMPPDrri, X86::CMPPDrmi, TB_ALIGN_16 }, + { X86::CMPPSrri, X86::CMPPSrmi, TB_ALIGN_16 }, + { X86::CMPSDrr, X86::CMPSDrm, 0 }, + { X86::CMPSSrr, X86::CMPSSrm, 0 }, + { X86::DIVPDrr, X86::DIVPDrm, TB_ALIGN_16 }, + { X86::DIVPSrr, X86::DIVPSrm, TB_ALIGN_16 }, + { X86::DIVSDrr, X86::DIVSDrm, 0 }, + { X86::DIVSSrr, X86::DIVSSrm, 0 }, + { X86::FsANDNPDrr, X86::FsANDNPDrm, TB_ALIGN_16 }, + { X86::FsANDNPSrr, X86::FsANDNPSrm, TB_ALIGN_16 }, + { X86::FsANDPDrr, X86::FsANDPDrm, TB_ALIGN_16 }, + { X86::FsANDPSrr, X86::FsANDPSrm, TB_ALIGN_16 }, + { X86::FsORPDrr, X86::FsORPDrm, TB_ALIGN_16 }, + { X86::FsORPSrr, X86::FsORPSrm, TB_ALIGN_16 }, + { X86::FsXORPDrr, X86::FsXORPDrm, TB_ALIGN_16 }, + { X86::FsXORPSrr, X86::FsXORPSrm, TB_ALIGN_16 }, + { X86::HADDPDrr, X86::HADDPDrm, TB_ALIGN_16 }, + { X86::HADDPSrr, X86::HADDPSrm, TB_ALIGN_16 }, + { X86::HSUBPDrr, X86::HSUBPDrm, TB_ALIGN_16 }, + { X86::HSUBPSrr, X86::HSUBPSrm, TB_ALIGN_16 }, + { X86::IMUL16rr, X86::IMUL16rm, 0 }, + { X86::IMUL32rr, X86::IMUL32rm, 0 }, + { X86::IMUL64rr, X86::IMUL64rm, 0 }, + { X86::Int_CMPSDrr, X86::Int_CMPSDrm, 0 }, + { X86::Int_CMPSSrr, X86::Int_CMPSSrm, 0 }, + { X86::MAXPDrr, X86::MAXPDrm, TB_ALIGN_16 }, + { X86::MAXPDrr_Int, X86::MAXPDrm_Int, TB_ALIGN_16 }, + { X86::MAXPSrr, X86::MAXPSrm, TB_ALIGN_16 }, + { X86::MAXPSrr_Int, X86::MAXPSrm_Int, TB_ALIGN_16 }, + { X86::MAXSDrr, X86::MAXSDrm, 0 }, + { X86::MAXSDrr_Int, X86::MAXSDrm_Int, 0 }, + { X86::MAXSSrr, X86::MAXSSrm, 0 }, + { X86::MAXSSrr_Int, X86::MAXSSrm_Int, 0 }, + { X86::MINPDrr, 
X86::MINPDrm, TB_ALIGN_16 }, + { X86::MINPDrr_Int, X86::MINPDrm_Int, TB_ALIGN_16 }, + { X86::MINPSrr, X86::MINPSrm, TB_ALIGN_16 }, + { X86::MINPSrr_Int, X86::MINPSrm_Int, TB_ALIGN_16 }, + { X86::MINSDrr, X86::MINSDrm, 0 }, + { X86::MINSDrr_Int, X86::MINSDrm_Int, 0 }, + { X86::MINSSrr, X86::MINSSrm, 0 }, + { X86::MINSSrr_Int, X86::MINSSrm_Int, 0 }, + { X86::MULPDrr, X86::MULPDrm, TB_ALIGN_16 }, + { X86::MULPSrr, X86::MULPSrm, TB_ALIGN_16 }, + { X86::MULSDrr, X86::MULSDrm, 0 }, + { X86::MULSSrr, X86::MULSSrm, 0 }, + { X86::OR16rr, X86::OR16rm, 0 }, + { X86::OR32rr, X86::OR32rm, 0 }, + { X86::OR64rr, X86::OR64rm, 0 }, + { X86::OR8rr, X86::OR8rm, 0 }, + { X86::ORPDrr, X86::ORPDrm, TB_ALIGN_16 }, + { X86::ORPSrr, X86::ORPSrm, TB_ALIGN_16 }, + { X86::PACKSSDWrr, X86::PACKSSDWrm, TB_ALIGN_16 }, + { X86::PACKSSWBrr, X86::PACKSSWBrm, TB_ALIGN_16 }, + { X86::PACKUSWBrr, X86::PACKUSWBrm, TB_ALIGN_16 }, + { X86::PADDBrr, X86::PADDBrm, TB_ALIGN_16 }, + { X86::PADDDrr, X86::PADDDrm, TB_ALIGN_16 }, + { X86::PADDQrr, X86::PADDQrm, TB_ALIGN_16 }, + { X86::PADDSBrr, X86::PADDSBrm, TB_ALIGN_16 }, + { X86::PADDSWrr, X86::PADDSWrm, TB_ALIGN_16 }, + { X86::PADDWrr, X86::PADDWrm, TB_ALIGN_16 }, + { X86::PANDNrr, X86::PANDNrm, TB_ALIGN_16 }, + { X86::PANDrr, X86::PANDrm, TB_ALIGN_16 }, + { X86::PAVGBrr, X86::PAVGBrm, TB_ALIGN_16 }, + { X86::PAVGWrr, X86::PAVGWrm, TB_ALIGN_16 }, + { X86::PCMPEQBrr, X86::PCMPEQBrm, TB_ALIGN_16 }, + { X86::PCMPEQDrr, X86::PCMPEQDrm, TB_ALIGN_16 }, + { X86::PCMPEQWrr, X86::PCMPEQWrm, TB_ALIGN_16 }, + { X86::PCMPGTBrr, X86::PCMPGTBrm, TB_ALIGN_16 }, + { X86::PCMPGTDrr, X86::PCMPGTDrm, TB_ALIGN_16 }, + { X86::PCMPGTWrr, X86::PCMPGTWrm, TB_ALIGN_16 }, + { X86::PINSRWrri, X86::PINSRWrmi, TB_ALIGN_16 }, + { X86::PMADDWDrr, X86::PMADDWDrm, TB_ALIGN_16 }, + { X86::PMAXSWrr, X86::PMAXSWrm, TB_ALIGN_16 }, + { X86::PMAXUBrr, X86::PMAXUBrm, TB_ALIGN_16 }, + { X86::PMINSWrr, X86::PMINSWrm, TB_ALIGN_16 }, + { X86::PMINUBrr, X86::PMINUBrm, TB_ALIGN_16 }, + { X86::PMULDQrr, X86::PMULDQrm, TB_ALIGN_16 }, + { X86::PMULHUWrr, X86::PMULHUWrm, TB_ALIGN_16 }, + { X86::PMULHWrr, X86::PMULHWrm, TB_ALIGN_16 }, + { X86::PMULLDrr, X86::PMULLDrm, TB_ALIGN_16 }, + { X86::PMULLWrr, X86::PMULLWrm, TB_ALIGN_16 }, + { X86::PMULUDQrr, X86::PMULUDQrm, TB_ALIGN_16 }, + { X86::PORrr, X86::PORrm, TB_ALIGN_16 }, + { X86::PSADBWrr, X86::PSADBWrm, TB_ALIGN_16 }, + { X86::PSLLDrr, X86::PSLLDrm, TB_ALIGN_16 }, + { X86::PSLLQrr, X86::PSLLQrm, TB_ALIGN_16 }, + { X86::PSLLWrr, X86::PSLLWrm, TB_ALIGN_16 }, + { X86::PSRADrr, X86::PSRADrm, TB_ALIGN_16 }, + { X86::PSRAWrr, X86::PSRAWrm, TB_ALIGN_16 }, + { X86::PSRLDrr, X86::PSRLDrm, TB_ALIGN_16 }, + { X86::PSRLQrr, X86::PSRLQrm, TB_ALIGN_16 }, + { X86::PSRLWrr, X86::PSRLWrm, TB_ALIGN_16 }, + { X86::PSUBBrr, X86::PSUBBrm, TB_ALIGN_16 }, + { X86::PSUBDrr, X86::PSUBDrm, TB_ALIGN_16 }, + { X86::PSUBSBrr, X86::PSUBSBrm, TB_ALIGN_16 }, + { X86::PSUBSWrr, X86::PSUBSWrm, TB_ALIGN_16 }, + { X86::PSUBWrr, X86::PSUBWrm, TB_ALIGN_16 }, + { X86::PUNPCKHBWrr, X86::PUNPCKHBWrm, TB_ALIGN_16 }, + { X86::PUNPCKHDQrr, X86::PUNPCKHDQrm, TB_ALIGN_16 }, + { X86::PUNPCKHQDQrr, X86::PUNPCKHQDQrm, TB_ALIGN_16 }, + { X86::PUNPCKHWDrr, X86::PUNPCKHWDrm, TB_ALIGN_16 }, + { X86::PUNPCKLBWrr, X86::PUNPCKLBWrm, TB_ALIGN_16 }, + { X86::PUNPCKLDQrr, X86::PUNPCKLDQrm, TB_ALIGN_16 }, + { X86::PUNPCKLQDQrr, X86::PUNPCKLQDQrm, TB_ALIGN_16 }, + { X86::PUNPCKLWDrr, X86::PUNPCKLWDrm, TB_ALIGN_16 }, + { X86::PXORrr, X86::PXORrm, TB_ALIGN_16 }, + { X86::SBB32rr, X86::SBB32rm, 0 }, + { X86::SBB64rr, X86::SBB64rm, 0 }, + { 
X86::SHUFPDrri, X86::SHUFPDrmi, TB_ALIGN_16 }, + { X86::SHUFPSrri, X86::SHUFPSrmi, TB_ALIGN_16 }, + { X86::SUB16rr, X86::SUB16rm, 0 }, + { X86::SUB32rr, X86::SUB32rm, 0 }, + { X86::SUB64rr, X86::SUB64rm, 0 }, + { X86::SUB8rr, X86::SUB8rm, 0 }, + { X86::SUBPDrr, X86::SUBPDrm, TB_ALIGN_16 }, + { X86::SUBPSrr, X86::SUBPSrm, TB_ALIGN_16 }, + { X86::SUBSDrr, X86::SUBSDrm, 0 }, + { X86::SUBSSrr, X86::SUBSSrm, 0 }, + // FIXME: TEST*rr -> swapped operand of TEST*mr. + { X86::UNPCKHPDrr, X86::UNPCKHPDrm, TB_ALIGN_16 }, + { X86::UNPCKHPSrr, X86::UNPCKHPSrm, TB_ALIGN_16 }, + { X86::UNPCKLPDrr, X86::UNPCKLPDrm, TB_ALIGN_16 }, + { X86::UNPCKLPSrr, X86::UNPCKLPSrm, TB_ALIGN_16 }, + { X86::XOR16rr, X86::XOR16rm, 0 }, + { X86::XOR32rr, X86::XOR32rm, 0 }, + { X86::XOR64rr, X86::XOR64rm, 0 }, + { X86::XOR8rr, X86::XOR8rm, 0 }, + { X86::XORPDrr, X86::XORPDrm, TB_ALIGN_16 }, + { X86::XORPSrr, X86::XORPSrm, TB_ALIGN_16 }, + // AVX 128-bit versions of foldable instructions + { X86::VCVTSD2SSrr, X86::VCVTSD2SSrm, 0 }, + { X86::Int_VCVTSD2SSrr, X86::Int_VCVTSD2SSrm, 0 }, + { X86::VCVTSI2SD64rr, X86::VCVTSI2SD64rm, 0 }, + { X86::Int_VCVTSI2SD64rr, X86::Int_VCVTSI2SD64rm, 0 }, + { X86::VCVTSI2SDrr, X86::VCVTSI2SDrm, 0 }, + { X86::Int_VCVTSI2SDrr, X86::Int_VCVTSI2SDrm, 0 }, + { X86::VCVTSI2SS64rr, X86::VCVTSI2SS64rm, 0 }, + { X86::Int_VCVTSI2SS64rr, X86::Int_VCVTSI2SS64rm, 0 }, + { X86::VCVTSI2SSrr, X86::VCVTSI2SSrm, 0 }, + { X86::Int_VCVTSI2SSrr, X86::Int_VCVTSI2SSrm, 0 }, + { X86::VCVTSS2SDrr, X86::VCVTSS2SDrm, 0 }, + { X86::Int_VCVTSS2SDrr, X86::Int_VCVTSS2SDrm, 0 }, + { X86::VCVTTSD2SI64rr, X86::VCVTTSD2SI64rm, 0 }, + { X86::Int_VCVTTSD2SI64rr,X86::Int_VCVTTSD2SI64rm, 0 }, + { X86::VCVTTSD2SIrr, X86::VCVTTSD2SIrm, 0 }, + { X86::Int_VCVTTSD2SIrr, X86::Int_VCVTTSD2SIrm, 0 }, + { X86::VCVTTSS2SI64rr, X86::VCVTTSS2SI64rm, 0 }, + { X86::Int_VCVTTSS2SI64rr,X86::Int_VCVTTSS2SI64rm, 0 }, + { X86::VCVTTSS2SIrr, X86::VCVTTSS2SIrm, 0 }, + { X86::Int_VCVTTSS2SIrr, X86::Int_VCVTTSS2SIrm, 0 }, + { X86::VCVTSD2SI64rr, X86::VCVTSD2SI64rm, 0 }, + { X86::VCVTSD2SIrr, X86::VCVTSD2SIrm, 0 }, + { X86::VCVTTPD2DQrr, X86::VCVTTPD2DQrm, TB_ALIGN_16 }, + { X86::VCVTTPS2DQrr, X86::VCVTTPS2DQrm, TB_ALIGN_16 }, + { X86::VRSQRTSSr, X86::VRSQRTSSm, 0 }, + { X86::VSQRTSDr, X86::VSQRTSDm, 0 }, + { X86::VSQRTSSr, X86::VSQRTSSm, 0 }, + { X86::VADDPDrr, X86::VADDPDrm, TB_ALIGN_16 }, + { X86::VADDPSrr, X86::VADDPSrm, TB_ALIGN_16 }, + { X86::VADDSDrr, X86::VADDSDrm, 0 }, + { X86::VADDSSrr, X86::VADDSSrm, 0 }, + { X86::VADDSUBPDrr, X86::VADDSUBPDrm, TB_ALIGN_16 }, + { X86::VADDSUBPSrr, X86::VADDSUBPSrm, TB_ALIGN_16 }, + { X86::VANDNPDrr, X86::VANDNPDrm, TB_ALIGN_16 }, + { X86::VANDNPSrr, X86::VANDNPSrm, TB_ALIGN_16 }, + { X86::VANDPDrr, X86::VANDPDrm, TB_ALIGN_16 }, + { X86::VANDPSrr, X86::VANDPSrm, TB_ALIGN_16 }, + { X86::VCMPPDrri, X86::VCMPPDrmi, TB_ALIGN_16 }, + { X86::VCMPPSrri, X86::VCMPPSrmi, TB_ALIGN_16 }, + { X86::VCMPSDrr, X86::VCMPSDrm, 0 }, + { X86::VCMPSSrr, X86::VCMPSSrm, 0 }, + { X86::VDIVPDrr, X86::VDIVPDrm, TB_ALIGN_16 }, + { X86::VDIVPSrr, X86::VDIVPSrm, TB_ALIGN_16 }, + { X86::VDIVSDrr, X86::VDIVSDrm, 0 }, + { X86::VDIVSSrr, X86::VDIVSSrm, 0 }, + { X86::VFsANDNPDrr, X86::VFsANDNPDrm, TB_ALIGN_16 }, + { X86::VFsANDNPSrr, X86::VFsANDNPSrm, TB_ALIGN_16 }, + { X86::VFsANDPDrr, X86::VFsANDPDrm, TB_ALIGN_16 }, + { X86::VFsANDPSrr, X86::VFsANDPSrm, TB_ALIGN_16 }, + { X86::VFsORPDrr, X86::VFsORPDrm, TB_ALIGN_16 }, + { X86::VFsORPSrr, X86::VFsORPSrm, TB_ALIGN_16 }, + { X86::VFsXORPDrr, X86::VFsXORPDrm, TB_ALIGN_16 }, + { X86::VFsXORPSrr, 
X86::VFsXORPSrm, TB_ALIGN_16 }, + { X86::VHADDPDrr, X86::VHADDPDrm, TB_ALIGN_16 }, + { X86::VHADDPSrr, X86::VHADDPSrm, TB_ALIGN_16 }, + { X86::VHSUBPDrr, X86::VHSUBPDrm, TB_ALIGN_16 }, + { X86::VHSUBPSrr, X86::VHSUBPSrm, TB_ALIGN_16 }, + { X86::Int_VCMPSDrr, X86::Int_VCMPSDrm, 0 }, + { X86::Int_VCMPSSrr, X86::Int_VCMPSSrm, 0 }, + { X86::VMAXPDrr, X86::VMAXPDrm, TB_ALIGN_16 }, + { X86::VMAXPDrr_Int, X86::VMAXPDrm_Int, TB_ALIGN_16 }, + { X86::VMAXPSrr, X86::VMAXPSrm, TB_ALIGN_16 }, + { X86::VMAXPSrr_Int, X86::VMAXPSrm_Int, TB_ALIGN_16 }, + { X86::VMAXSDrr, X86::VMAXSDrm, 0 }, + { X86::VMAXSDrr_Int, X86::VMAXSDrm_Int, 0 }, + { X86::VMAXSSrr, X86::VMAXSSrm, 0 }, + { X86::VMAXSSrr_Int, X86::VMAXSSrm_Int, 0 }, + { X86::VMINPDrr, X86::VMINPDrm, TB_ALIGN_16 }, + { X86::VMINPDrr_Int, X86::VMINPDrm_Int, TB_ALIGN_16 }, + { X86::VMINPSrr, X86::VMINPSrm, TB_ALIGN_16 }, + { X86::VMINPSrr_Int, X86::VMINPSrm_Int, TB_ALIGN_16 }, + { X86::VMINSDrr, X86::VMINSDrm, 0 }, + { X86::VMINSDrr_Int, X86::VMINSDrm_Int, 0 }, + { X86::VMINSSrr, X86::VMINSSrm, 0 }, + { X86::VMINSSrr_Int, X86::VMINSSrm_Int, 0 }, + { X86::VMULPDrr, X86::VMULPDrm, TB_ALIGN_16 }, + { X86::VMULPSrr, X86::VMULPSrm, TB_ALIGN_16 }, + { X86::VMULSDrr, X86::VMULSDrm, 0 }, + { X86::VMULSSrr, X86::VMULSSrm, 0 }, + { X86::VORPDrr, X86::VORPDrm, TB_ALIGN_16 }, + { X86::VORPSrr, X86::VORPSrm, TB_ALIGN_16 }, + { X86::VPACKSSDWrr, X86::VPACKSSDWrm, TB_ALIGN_16 }, + { X86::VPACKSSWBrr, X86::VPACKSSWBrm, TB_ALIGN_16 }, + { X86::VPACKUSWBrr, X86::VPACKUSWBrm, TB_ALIGN_16 }, + { X86::VPADDBrr, X86::VPADDBrm, TB_ALIGN_16 }, + { X86::VPADDDrr, X86::VPADDDrm, TB_ALIGN_16 }, + { X86::VPADDQrr, X86::VPADDQrm, TB_ALIGN_16 }, + { X86::VPADDSBrr, X86::VPADDSBrm, TB_ALIGN_16 }, + { X86::VPADDSWrr, X86::VPADDSWrm, TB_ALIGN_16 }, + { X86::VPADDWrr, X86::VPADDWrm, TB_ALIGN_16 }, + { X86::VPANDNrr, X86::VPANDNrm, TB_ALIGN_16 }, + { X86::VPANDrr, X86::VPANDrm, TB_ALIGN_16 }, + { X86::VPCMPEQBrr, X86::VPCMPEQBrm, TB_ALIGN_16 }, + { X86::VPCMPEQDrr, X86::VPCMPEQDrm, TB_ALIGN_16 }, + { X86::VPCMPEQWrr, X86::VPCMPEQWrm, TB_ALIGN_16 }, + { X86::VPCMPGTBrr, X86::VPCMPGTBrm, TB_ALIGN_16 }, + { X86::VPCMPGTDrr, X86::VPCMPGTDrm, TB_ALIGN_16 }, + { X86::VPCMPGTWrr, X86::VPCMPGTWrm, TB_ALIGN_16 }, + { X86::VPINSRWrri, X86::VPINSRWrmi, TB_ALIGN_16 }, + { X86::VPMADDWDrr, X86::VPMADDWDrm, TB_ALIGN_16 }, + { X86::VPMAXSWrr, X86::VPMAXSWrm, TB_ALIGN_16 }, + { X86::VPMAXUBrr, X86::VPMAXUBrm, TB_ALIGN_16 }, + { X86::VPMINSWrr, X86::VPMINSWrm, TB_ALIGN_16 }, + { X86::VPMINUBrr, X86::VPMINUBrm, TB_ALIGN_16 }, + { X86::VPMULDQrr, X86::VPMULDQrm, TB_ALIGN_16 }, + { X86::VPMULHUWrr, X86::VPMULHUWrm, TB_ALIGN_16 }, + { X86::VPMULHWrr, X86::VPMULHWrm, TB_ALIGN_16 }, + { X86::VPMULLDrr, X86::VPMULLDrm, TB_ALIGN_16 }, + { X86::VPMULLWrr, X86::VPMULLWrm, TB_ALIGN_16 }, + { X86::VPMULUDQrr, X86::VPMULUDQrm, TB_ALIGN_16 }, + { X86::VPORrr, X86::VPORrm, TB_ALIGN_16 }, + { X86::VPSADBWrr, X86::VPSADBWrm, TB_ALIGN_16 }, + { X86::VPSLLDrr, X86::VPSLLDrm, TB_ALIGN_16 }, + { X86::VPSLLQrr, X86::VPSLLQrm, TB_ALIGN_16 }, + { X86::VPSLLWrr, X86::VPSLLWrm, TB_ALIGN_16 }, + { X86::VPSRADrr, X86::VPSRADrm, TB_ALIGN_16 }, + { X86::VPSRAWrr, X86::VPSRAWrm, TB_ALIGN_16 }, + { X86::VPSRLDrr, X86::VPSRLDrm, TB_ALIGN_16 }, + { X86::VPSRLQrr, X86::VPSRLQrm, TB_ALIGN_16 }, + { X86::VPSRLWrr, X86::VPSRLWrm, TB_ALIGN_16 }, + { X86::VPSUBBrr, X86::VPSUBBrm, TB_ALIGN_16 }, + { X86::VPSUBDrr, X86::VPSUBDrm, TB_ALIGN_16 }, + { X86::VPSUBSBrr, X86::VPSUBSBrm, TB_ALIGN_16 }, + { X86::VPSUBSWrr, X86::VPSUBSWrm, TB_ALIGN_16 }, + 
{ X86::VPSUBWrr, X86::VPSUBWrm, TB_ALIGN_16 }, + { X86::VPUNPCKHBWrr, X86::VPUNPCKHBWrm, TB_ALIGN_16 }, + { X86::VPUNPCKHDQrr, X86::VPUNPCKHDQrm, TB_ALIGN_16 }, + { X86::VPUNPCKHQDQrr, X86::VPUNPCKHQDQrm, TB_ALIGN_16 }, + { X86::VPUNPCKHWDrr, X86::VPUNPCKHWDrm, TB_ALIGN_16 }, + { X86::VPUNPCKLBWrr, X86::VPUNPCKLBWrm, TB_ALIGN_16 }, + { X86::VPUNPCKLDQrr, X86::VPUNPCKLDQrm, TB_ALIGN_16 }, + { X86::VPUNPCKLQDQrr, X86::VPUNPCKLQDQrm, TB_ALIGN_16 }, + { X86::VPUNPCKLWDrr, X86::VPUNPCKLWDrm, TB_ALIGN_16 }, + { X86::VPXORrr, X86::VPXORrm, TB_ALIGN_16 }, + { X86::VSHUFPDrri, X86::VSHUFPDrmi, TB_ALIGN_16 }, + { X86::VSHUFPSrri, X86::VSHUFPSrmi, TB_ALIGN_16 }, + { X86::VSUBPDrr, X86::VSUBPDrm, TB_ALIGN_16 }, + { X86::VSUBPSrr, X86::VSUBPSrm, TB_ALIGN_16 }, + { X86::VSUBSDrr, X86::VSUBSDrm, 0 }, + { X86::VSUBSSrr, X86::VSUBSSrm, 0 }, + { X86::VUNPCKHPDrr, X86::VUNPCKHPDrm, TB_ALIGN_16 }, + { X86::VUNPCKHPSrr, X86::VUNPCKHPSrm, TB_ALIGN_16 }, + { X86::VUNPCKLPDrr, X86::VUNPCKLPDrm, TB_ALIGN_16 }, + { X86::VUNPCKLPSrr, X86::VUNPCKLPSrm, TB_ALIGN_16 }, + { X86::VXORPDrr, X86::VXORPDrm, TB_ALIGN_16 }, + { X86::VXORPSrr, X86::VXORPSrm, TB_ALIGN_16 } + // FIXME: add AVX 256-bit foldable instructions + }; + + for (unsigned i = 0, e = array_lengthof(OpTbl2); i != e; ++i) { + unsigned RegOp = OpTbl2[i][0]; + unsigned MemOp = OpTbl2[i][1]; + unsigned Flags = OpTbl2[i][2]; + AddTableEntry(RegOp2MemOpTable2, MemOp2RegOpTable, + RegOp, MemOp, + // Index 2, folded load + Flags | TB_INDEX_2 | TB_FOLDED_LOAD); + } +} + +void +X86InstrInfo::AddTableEntry(RegOp2MemOpTableType &R2MTable, + MemOp2RegOpTableType &M2RTable, + unsigned RegOp, unsigned MemOp, unsigned Flags) { + if ((Flags & TB_NO_FORWARD) == 0) { + assert(!R2MTable.count(RegOp) && "Duplicate entry!"); + R2MTable[RegOp] = std::make_pair(MemOp, Flags); + } + if ((Flags & TB_NO_REVERSE) == 0) { + assert(!M2RTable.count(MemOp) && + "Duplicated entries in unfolding maps?"); + M2RTable[MemOp] = std::make_pair(RegOp, Flags); + } +} + +bool +X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, + unsigned &SrcReg, unsigned &DstReg, + unsigned &SubIdx) const { + switch (MI.getOpcode()) { + default: break; + case X86::MOVSX16rr8: + case X86::MOVZX16rr8: + case X86::MOVSX32rr8: + case X86::MOVZX32rr8: + case X86::MOVSX64rr8: + case X86::MOVZX64rr8: + if (!TM.getSubtarget<X86Subtarget>().is64Bit()) + // It's not always legal to reference the low 8-bit of the larger + // register in 32-bit mode. + return false; + case X86::MOVSX32rr16: + case X86::MOVZX32rr16: + case X86::MOVSX64rr16: + case X86::MOVZX64rr16: + case X86::MOVSX64rr32: + case X86::MOVZX64rr32: { + if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg()) + // Be conservative. + return false; + SrcReg = MI.getOperand(1).getReg(); + DstReg = MI.getOperand(0).getReg(); + switch (MI.getOpcode()) { + default: + llvm_unreachable(0); + break; + case X86::MOVSX16rr8: + case X86::MOVZX16rr8: + case X86::MOVSX32rr8: + case X86::MOVZX32rr8: + case X86::MOVSX64rr8: + case X86::MOVZX64rr8: + SubIdx = X86::sub_8bit; + break; + case X86::MOVSX32rr16: + case X86::MOVZX32rr16: + case X86::MOVSX64rr16: + case X86::MOVZX64rr16: + SubIdx = X86::sub_16bit; + break; + case X86::MOVSX64rr32: + case X86::MOVZX64rr32: + SubIdx = X86::sub_32bit; + break; + } + return true; + } + } + return false; +} + +/// isFrameOperand - Return true and the FrameIndex if the specified +/// operand and follow operands form a reference to the stack frame. 
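+/// For illustration only: the shape being matched is the standard X86
+/// five-operand memory reference (base, scale, index, displacement, segment)
+/// in its simplest frame form, e.g. for a load whose address starts at
+/// operand 1:
+///   %reg = MOV32rm <fi#N>, 1, %noreg, 0, %noreg
+/// i.e. the base is a frame index, the scale is 1, there is no index
+/// register, and the displacement is 0.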
+bool X86InstrInfo::isFrameOperand(const MachineInstr *MI, unsigned int Op, + int &FrameIndex) const { + if (MI->getOperand(Op).isFI() && MI->getOperand(Op+1).isImm() && + MI->getOperand(Op+2).isReg() && MI->getOperand(Op+3).isImm() && + MI->getOperand(Op+1).getImm() == 1 && + MI->getOperand(Op+2).getReg() == 0 && + MI->getOperand(Op+3).getImm() == 0) { + FrameIndex = MI->getOperand(Op).getIndex(); + return true; + } + return false; +} + +static bool isFrameLoadOpcode(int Opcode) { + switch (Opcode) { + default: break; + case X86::MOV8rm: + case X86::MOV16rm: + case X86::MOV32rm: + case X86::MOV64rm: + case X86::LD_Fp64m: + case X86::MOVSSrm: + case X86::MOVSDrm: + case X86::MOVAPSrm: + case X86::MOVAPDrm: + case X86::MOVDQArm: + case X86::VMOVSSrm: + case X86::VMOVSDrm: + case X86::VMOVAPSrm: + case X86::VMOVAPDrm: + case X86::VMOVDQArm: + case X86::VMOVAPSYrm: + case X86::VMOVAPDYrm: + case X86::VMOVDQAYrm: + case X86::MMX_MOVD64rm: + case X86::MMX_MOVQ64rm: + return true; + break; + } + return false; +} + +static bool isFrameStoreOpcode(int Opcode) { + switch (Opcode) { + default: break; + case X86::MOV8mr: + case X86::MOV16mr: + case X86::MOV32mr: + case X86::MOV64mr: + case X86::ST_FpP64m: + case X86::MOVSSmr: + case X86::MOVSDmr: + case X86::MOVAPSmr: + case X86::MOVAPDmr: + case X86::MOVDQAmr: + case X86::VMOVSSmr: + case X86::VMOVSDmr: + case X86::VMOVAPSmr: + case X86::VMOVAPDmr: + case X86::VMOVDQAmr: + case X86::VMOVAPSYmr: + case X86::VMOVAPDYmr: + case X86::VMOVDQAYmr: + case X86::MMX_MOVD64mr: + case X86::MMX_MOVQ64mr: + case X86::MMX_MOVNTQmr: + return true; + } + return false; +} + +unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const { + if (isFrameLoadOpcode(MI->getOpcode())) + if (MI->getOperand(0).getSubReg() == 0 && isFrameOperand(MI, 1, FrameIndex)) + return MI->getOperand(0).getReg(); + return 0; +} + +unsigned X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const { + if (isFrameLoadOpcode(MI->getOpcode())) { + unsigned Reg; + if ((Reg = isLoadFromStackSlot(MI, FrameIndex))) + return Reg; + // Check for post-frame index elimination operations + const MachineMemOperand *Dummy; + return hasLoadFromStackSlot(MI, Dummy, FrameIndex); + } + return 0; +} + +unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr *MI, + int &FrameIndex) const { + if (isFrameStoreOpcode(MI->getOpcode())) + if (MI->getOperand(X86::AddrNumOperands).getSubReg() == 0 && + isFrameOperand(MI, 0, FrameIndex)) + return MI->getOperand(X86::AddrNumOperands).getReg(); + return 0; +} + +unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const { + if (isFrameStoreOpcode(MI->getOpcode())) { + unsigned Reg; + if ((Reg = isStoreToStackSlot(MI, FrameIndex))) + return Reg; + // Check for post-frame index elimination operations + const MachineMemOperand *Dummy; + return hasStoreToStackSlot(MI, Dummy, FrameIndex); + } + return 0; +} + +/// regIsPICBase - Return true if register is PIC base (i.e.g defined by +/// X86::MOVPC32r. 
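+/// As a rough sketch of the idiom being recognized, MOVPC32r is the call/pop
+/// pseudo used to materialize the current PC as the PIC base in 32-bit
+/// position-independent code, e.g.
+///   calll .L0$pb
+/// .L0$pb:
+///   popl  %reg
+/// (label name illustrative). A register qualifies as the PIC base only if
+/// every definition of it is such an instruction.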
+static bool regIsPICBase(unsigned BaseReg, const MachineRegisterInfo &MRI) { + bool isPICBase = false; + for (MachineRegisterInfo::def_iterator I = MRI.def_begin(BaseReg), + E = MRI.def_end(); I != E; ++I) { + MachineInstr *DefMI = I.getOperand().getParent(); + if (DefMI->getOpcode() != X86::MOVPC32r) + return false; + assert(!isPICBase && "More than one PIC base?"); + isPICBase = true; + } + return isPICBase; +} + +bool +X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI, + AliasAnalysis *AA) const { + switch (MI->getOpcode()) { + default: break; + case X86::MOV8rm: + case X86::MOV16rm: + case X86::MOV32rm: + case X86::MOV64rm: + case X86::LD_Fp64m: + case X86::MOVSSrm: + case X86::MOVSDrm: + case X86::MOVAPSrm: + case X86::MOVUPSrm: + case X86::MOVAPDrm: + case X86::MOVDQArm: + case X86::VMOVSSrm: + case X86::VMOVSDrm: + case X86::VMOVAPSrm: + case X86::VMOVUPSrm: + case X86::VMOVAPDrm: + case X86::VMOVDQArm: + case X86::VMOVAPSYrm: + case X86::VMOVUPSYrm: + case X86::VMOVAPDYrm: + case X86::VMOVDQAYrm: + case X86::MMX_MOVD64rm: + case X86::MMX_MOVQ64rm: + case X86::FsVMOVAPSrm: + case X86::FsVMOVAPDrm: + case X86::FsMOVAPSrm: + case X86::FsMOVAPDrm: { + // Loads from constant pools are trivially rematerializable. + if (MI->getOperand(1).isReg() && + MI->getOperand(2).isImm() && + MI->getOperand(3).isReg() && MI->getOperand(3).getReg() == 0 && + MI->isInvariantLoad(AA)) { + unsigned BaseReg = MI->getOperand(1).getReg(); + if (BaseReg == 0 || BaseReg == X86::RIP) + return true; + // Allow re-materialization of PIC load. + if (!ReMatPICStubLoad && MI->getOperand(4).isGlobal()) + return false; + const MachineFunction &MF = *MI->getParent()->getParent(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + bool isPICBase = false; + for (MachineRegisterInfo::def_iterator I = MRI.def_begin(BaseReg), + E = MRI.def_end(); I != E; ++I) { + MachineInstr *DefMI = I.getOperand().getParent(); + if (DefMI->getOpcode() != X86::MOVPC32r) + return false; + assert(!isPICBase && "More than one PIC base?"); + isPICBase = true; + } + return isPICBase; + } + return false; + } + + case X86::LEA32r: + case X86::LEA64r: { + if (MI->getOperand(2).isImm() && + MI->getOperand(3).isReg() && MI->getOperand(3).getReg() == 0 && + !MI->getOperand(4).isReg()) { + // lea fi#, lea GV, etc. are all rematerializable. + if (!MI->getOperand(1).isReg()) + return true; + unsigned BaseReg = MI->getOperand(1).getReg(); + if (BaseReg == 0) + return true; + // Allow re-materialization of lea PICBase + x. + const MachineFunction &MF = *MI->getParent()->getParent(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + return regIsPICBase(BaseReg, MRI); + } + return false; + } + } + + // All other instructions marked M_REMATERIALIZABLE are always trivially + // rematerializable. + return true; +} + +/// isSafeToClobberEFLAGS - Return true if it's safe insert an instruction that +/// would clobber the EFLAGS condition register. Note the result may be +/// conservative. If it cannot definitely determine the safety after visiting +/// a few instructions in each direction it assumes it's not safe. +static bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) { + MachineBasicBlock::iterator E = MBB.end(); + + // For compile time consideration, if we are not able to determine the + // safety after visiting 4 instructions in each direction, we will assume + // it's not safe. 
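+  // Sketch of the two decisive cases when scanning forward from I
+  // (register and label names are illustrative only):
+  //   addl $1, %eax   ; defines EFLAGS without reading it -> safe to clobber
+  //   jne  .LBB0_2    ; reads EFLAGS                      -> not safe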
+ MachineBasicBlock::iterator Iter = I; + for (unsigned i = 0; Iter != E && i < 4; ++i) { + bool SeenDef = false; + for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) { + MachineOperand &MO = Iter->getOperand(j); + if (!MO.isReg()) + continue; + if (MO.getReg() == X86::EFLAGS) { + if (MO.isUse()) + return false; + SeenDef = true; + } + } + + if (SeenDef) + // This instruction defines EFLAGS, no need to look any further. + return true; + ++Iter; + // Skip over DBG_VALUE. + while (Iter != E && Iter->isDebugValue()) + ++Iter; + } + + // It is safe to clobber EFLAGS at the end of a block of no successor has it + // live in. + if (Iter == E) { + for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(), + SE = MBB.succ_end(); SI != SE; ++SI) + if ((*SI)->isLiveIn(X86::EFLAGS)) + return false; + return true; + } + + MachineBasicBlock::iterator B = MBB.begin(); + Iter = I; + for (unsigned i = 0; i < 4; ++i) { + // If we make it to the beginning of the block, it's safe to clobber + // EFLAGS iff EFLAGS is not live-in. + if (Iter == B) + return !MBB.isLiveIn(X86::EFLAGS); + + --Iter; + // Skip over DBG_VALUE. + while (Iter != B && Iter->isDebugValue()) + --Iter; + + bool SawKill = false; + for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) { + MachineOperand &MO = Iter->getOperand(j); + if (MO.isReg() && MO.getReg() == X86::EFLAGS) { + if (MO.isDef()) return MO.isDead(); + if (MO.isKill()) SawKill = true; + } + } + + if (SawKill) + // This instruction kills EFLAGS and doesn't redefine it, so + // there's no need to look further. + return true; + } + + // Conservative answer. + return false; +} + +void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + unsigned DestReg, unsigned SubIdx, + const MachineInstr *Orig, + const TargetRegisterInfo &TRI) const { + DebugLoc DL = Orig->getDebugLoc(); + + // MOV32r0 etc. are implemented with xor which clobbers condition code. + // Re-materialize them as movri instructions to avoid side effects. + bool Clone = true; + unsigned Opc = Orig->getOpcode(); + switch (Opc) { + default: break; + case X86::MOV8r0: + case X86::MOV16r0: + case X86::MOV32r0: + case X86::MOV64r0: { + if (!isSafeToClobberEFLAGS(MBB, I)) { + switch (Opc) { + default: break; + case X86::MOV8r0: Opc = X86::MOV8ri; break; + case X86::MOV16r0: Opc = X86::MOV16ri; break; + case X86::MOV32r0: Opc = X86::MOV32ri; break; + case X86::MOV64r0: Opc = X86::MOV64ri64i32; break; + } + Clone = false; + } + break; + } + } + + if (Clone) { + MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig); + MBB.insert(I, MI); + } else { + BuildMI(MBB, I, DL, get(Opc)).addOperand(Orig->getOperand(0)).addImm(0); + } + + MachineInstr *NewMI = prior(I); + NewMI->substituteRegister(Orig->getOperand(0).getReg(), DestReg, SubIdx, TRI); +} + +/// hasLiveCondCodeDef - True if MI has a condition code def, e.g. EFLAGS, that +/// is not marked dead. +static bool hasLiveCondCodeDef(MachineInstr *MI) { + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (MO.isReg() && MO.isDef() && + MO.getReg() == X86::EFLAGS && !MO.isDead()) { + return true; + } + } + return false; +} + +/// convertToThreeAddressWithLEA - Helper for convertToThreeAddress when +/// 16-bit LEA is disabled, use 32-bit LEA to form 3-address code by promoting +/// to a 32-bit superregister and then truncating back down to a 16-bit +/// subregister. 
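+/// Roughly, in 64-bit mode an "addw $8, %ax" is rewritten as (sketch only,
+/// virtual register names invented):
+///   %vreg1 = IMPLICIT_DEF
+///   %vreg1:sub_16bit = COPY %ax
+///   %vreg2 = LEA64_32r %vreg1, 1, %noreg, 8, %noreg
+///   %ax = COPY %vreg2:sub_16bit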
+MachineInstr *
+X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
+                                           MachineFunction::iterator &MFI,
+                                           MachineBasicBlock::iterator &MBBI,
+                                           LiveVariables *LV) const {
+  MachineInstr *MI = MBBI;
+  unsigned Dest = MI->getOperand(0).getReg();
+  unsigned Src = MI->getOperand(1).getReg();
+  bool isDead = MI->getOperand(0).isDead();
+  bool isKill = MI->getOperand(1).isKill();
+
+  unsigned Opc = TM.getSubtarget<X86Subtarget>().is64Bit()
+    ? X86::LEA64_32r : X86::LEA32r;
+  MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo();
+  unsigned leaInReg = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
+  unsigned leaOutReg = RegInfo.createVirtualRegister(&X86::GR32RegClass);
+
+  // Build and insert into an implicit UNDEF value. This is OK because
+  // we'll be shifting and then extracting the lower 16-bits.
+  // This has the potential to cause a partial register stall, e.g.
+  //   movw (%rbp,%rcx,2), %dx
+  //   leal -65(%rdx), %esi
+  // But testing has shown this *does* help performance in 64-bit mode (at
+  // least on modern x86 machines).
+  BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(X86::IMPLICIT_DEF), leaInReg);
+  MachineInstr *InsMI =
+    BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(TargetOpcode::COPY))
+    .addReg(leaInReg, RegState::Define, X86::sub_16bit)
+    .addReg(Src, getKillRegState(isKill));
+
+  MachineInstrBuilder MIB = BuildMI(*MFI, MBBI, MI->getDebugLoc(),
+                                    get(Opc), leaOutReg);
+  switch (MIOpc) {
+  default:
+    llvm_unreachable(0);
+    break;
+  case X86::SHL16ri: {
+    unsigned ShAmt = MI->getOperand(2).getImm();
+    MIB.addReg(0).addImm(1 << ShAmt)
+       .addReg(leaInReg, RegState::Kill).addImm(0).addReg(0);
+    break;
+  }
+  case X86::INC16r:
+  case X86::INC64_16r:
+    addRegOffset(MIB, leaInReg, true, 1);
+    break;
+  case X86::DEC16r:
+  case X86::DEC64_16r:
+    addRegOffset(MIB, leaInReg, true, -1);
+    break;
+  case X86::ADD16ri:
+  case X86::ADD16ri8:
+  case X86::ADD16ri_DB:
+  case X86::ADD16ri8_DB:
+    addRegOffset(MIB, leaInReg, true, MI->getOperand(2).getImm());
+    break;
+  case X86::ADD16rr:
+  case X86::ADD16rr_DB: {
+    unsigned Src2 = MI->getOperand(2).getReg();
+    bool isKill2 = MI->getOperand(2).isKill();
+    unsigned leaInReg2 = 0;
+    MachineInstr *InsMI2 = 0;
+    if (Src == Src2) {
+      // ADD16rr %reg1028<kill>, %reg1028 needs
+      // just a single insert_subreg.
+      addRegReg(MIB, leaInReg, true, leaInReg, false);
+    } else {
+      leaInReg2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
+      // Build and insert into an implicit UNDEF value. This is OK because
+      // we'll be shifting and then extracting the lower 16-bits.
+ BuildMI(*MFI, MIB, MI->getDebugLoc(), get(X86::IMPLICIT_DEF), leaInReg2); + InsMI2 = + BuildMI(*MFI, MIB, MI->getDebugLoc(), get(TargetOpcode::COPY)) + .addReg(leaInReg2, RegState::Define, X86::sub_16bit) + .addReg(Src2, getKillRegState(isKill2)); + addRegReg(MIB, leaInReg, true, leaInReg2, true); + } + if (LV && isKill2 && InsMI2) + LV->replaceKillInstruction(Src2, MI, InsMI2); + break; + } + } + + MachineInstr *NewMI = MIB; + MachineInstr *ExtMI = + BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(TargetOpcode::COPY)) + .addReg(Dest, RegState::Define | getDeadRegState(isDead)) + .addReg(leaOutReg, RegState::Kill, X86::sub_16bit); + + if (LV) { + // Update live variables + LV->getVarInfo(leaInReg).Kills.push_back(NewMI); + LV->getVarInfo(leaOutReg).Kills.push_back(ExtMI); + if (isKill) + LV->replaceKillInstruction(Src, MI, InsMI); + if (isDead) + LV->replaceKillInstruction(Dest, MI, ExtMI); + } + + return ExtMI; +} + +/// convertToThreeAddress - This method must be implemented by targets that +/// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target +/// may be able to convert a two-address instruction into a true +/// three-address instruction on demand. This allows the X86 target (for +/// example) to convert ADD and SHL instructions into LEA instructions if they +/// would require register copies due to two-addressness. +/// +/// This method returns a null pointer if the transformation cannot be +/// performed, otherwise it returns the new instruction. +/// +MachineInstr * +X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, + MachineBasicBlock::iterator &MBBI, + LiveVariables *LV) const { + MachineInstr *MI = MBBI; + MachineFunction &MF = *MI->getParent()->getParent(); + // All instructions input are two-addr instructions. Get the known operands. + unsigned Dest = MI->getOperand(0).getReg(); + unsigned Src = MI->getOperand(1).getReg(); + bool isDead = MI->getOperand(0).isDead(); + bool isKill = MI->getOperand(1).isKill(); + + MachineInstr *NewMI = NULL; + // FIXME: 16-bit LEA's are really slow on Athlons, but not bad on P4's. When + // we have better subtarget support, enable the 16-bit LEA generation here. + // 16-bit LEA is also slow on Core2. + bool DisableLEA16 = true; + bool is64Bit = TM.getSubtarget<X86Subtarget>().is64Bit(); + + unsigned MIOpc = MI->getOpcode(); + switch (MIOpc) { + case X86::SHUFPSrri: { + assert(MI->getNumOperands() == 4 && "Unknown shufps instruction!"); + if (!TM.getSubtarget<X86Subtarget>().hasSSE2()) return 0; + + unsigned B = MI->getOperand(1).getReg(); + unsigned C = MI->getOperand(2).getReg(); + if (B != C) return 0; + unsigned A = MI->getOperand(0).getReg(); + unsigned M = MI->getOperand(3).getImm(); + NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::PSHUFDri)) + .addReg(A, RegState::Define | getDeadRegState(isDead)) + .addReg(B, getKillRegState(isKill)).addImm(M); + break; + } + case X86::SHL64ri: { + assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!"); + // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses + // the flags produced by a shift yet, so this is safe. + unsigned ShAmt = MI->getOperand(2).getImm(); + if (ShAmt == 0 || ShAmt >= 4) return 0; + + // LEA can't handle RSP. 
+ if (TargetRegisterInfo::isVirtualRegister(Src) && + !MF.getRegInfo().constrainRegClass(Src, &X86::GR64_NOSPRegClass)) + return 0; + + NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r)) + .addReg(Dest, RegState::Define | getDeadRegState(isDead)) + .addReg(0).addImm(1 << ShAmt) + .addReg(Src, getKillRegState(isKill)) + .addImm(0).addReg(0); + break; + } + case X86::SHL32ri: { + assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!"); + // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses + // the flags produced by a shift yet, so this is safe. + unsigned ShAmt = MI->getOperand(2).getImm(); + if (ShAmt == 0 || ShAmt >= 4) return 0; + + // LEA can't handle ESP. + if (TargetRegisterInfo::isVirtualRegister(Src) && + !MF.getRegInfo().constrainRegClass(Src, &X86::GR32_NOSPRegClass)) + return 0; + + unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r; + NewMI = BuildMI(MF, MI->getDebugLoc(), get(Opc)) + .addReg(Dest, RegState::Define | getDeadRegState(isDead)) + .addReg(0).addImm(1 << ShAmt) + .addReg(Src, getKillRegState(isKill)).addImm(0).addReg(0); + break; + } + case X86::SHL16ri: { + assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!"); + // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses + // the flags produced by a shift yet, so this is safe. + unsigned ShAmt = MI->getOperand(2).getImm(); + if (ShAmt == 0 || ShAmt >= 4) return 0; + + if (DisableLEA16) + return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0; + NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) + .addReg(Dest, RegState::Define | getDeadRegState(isDead)) + .addReg(0).addImm(1 << ShAmt) + .addReg(Src, getKillRegState(isKill)) + .addImm(0).addReg(0); + break; + } + default: { + // The following opcodes also sets the condition code register(s). Only + // convert them to equivalent lea if the condition code register def's + // are dead! + if (hasLiveCondCodeDef(MI)) + return 0; + + switch (MIOpc) { + default: return 0; + case X86::INC64r: + case X86::INC32r: + case X86::INC64_32r: { + assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!"); + unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r + : (is64Bit ? X86::LEA64_32r : X86::LEA32r); + + // LEA can't handle RSP. + if (TargetRegisterInfo::isVirtualRegister(Src) && + !MF.getRegInfo().constrainRegClass(Src, + MIOpc == X86::INC64r ? X86::GR64_NOSPRegisterClass : + X86::GR32_NOSPRegisterClass)) + return 0; + + NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc)) + .addReg(Dest, RegState::Define | + getDeadRegState(isDead)), + Src, isKill, 1); + break; + } + case X86::INC16r: + case X86::INC64_16r: + if (DisableLEA16) + return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0; + assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!"); + NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) + .addReg(Dest, RegState::Define | + getDeadRegState(isDead)), + Src, isKill, 1); + break; + case X86::DEC64r: + case X86::DEC32r: + case X86::DEC64_32r: { + assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!"); + unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r + : (is64Bit ? X86::LEA64_32r : X86::LEA32r); + // LEA can't handle RSP. + if (TargetRegisterInfo::isVirtualRegister(Src) && + !MF.getRegInfo().constrainRegClass(Src, + MIOpc == X86::DEC64r ? 
X86::GR64_NOSPRegisterClass : + X86::GR32_NOSPRegisterClass)) + return 0; + + NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc)) + .addReg(Dest, RegState::Define | + getDeadRegState(isDead)), + Src, isKill, -1); + break; + } + case X86::DEC16r: + case X86::DEC64_16r: + if (DisableLEA16) + return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0; + assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!"); + NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) + .addReg(Dest, RegState::Define | + getDeadRegState(isDead)), + Src, isKill, -1); + break; + case X86::ADD64rr: + case X86::ADD64rr_DB: + case X86::ADD32rr: + case X86::ADD32rr_DB: { + assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); + unsigned Opc; + TargetRegisterClass *RC; + if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB) { + Opc = X86::LEA64r; + RC = X86::GR64_NOSPRegisterClass; + } else { + Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r; + RC = X86::GR32_NOSPRegisterClass; + } + + + unsigned Src2 = MI->getOperand(2).getReg(); + bool isKill2 = MI->getOperand(2).isKill(); + + // LEA can't handle RSP. + if (TargetRegisterInfo::isVirtualRegister(Src2) && + !MF.getRegInfo().constrainRegClass(Src2, RC)) + return 0; + + NewMI = addRegReg(BuildMI(MF, MI->getDebugLoc(), get(Opc)) + .addReg(Dest, RegState::Define | + getDeadRegState(isDead)), + Src, isKill, Src2, isKill2); + if (LV && isKill2) + LV->replaceKillInstruction(Src2, MI, NewMI); + break; + } + case X86::ADD16rr: + case X86::ADD16rr_DB: { + if (DisableLEA16) + return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0; + assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); + unsigned Src2 = MI->getOperand(2).getReg(); + bool isKill2 = MI->getOperand(2).isKill(); + NewMI = addRegReg(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) + .addReg(Dest, RegState::Define | + getDeadRegState(isDead)), + Src, isKill, Src2, isKill2); + if (LV && isKill2) + LV->replaceKillInstruction(Src2, MI, NewMI); + break; + } + case X86::ADD64ri32: + case X86::ADD64ri8: + case X86::ADD64ri32_DB: + case X86::ADD64ri8_DB: + assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); + NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r)) + .addReg(Dest, RegState::Define | + getDeadRegState(isDead)), + Src, isKill, MI->getOperand(2).getImm()); + break; + case X86::ADD32ri: + case X86::ADD32ri8: + case X86::ADD32ri_DB: + case X86::ADD32ri8_DB: { + assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); + unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r; + NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc)) + .addReg(Dest, RegState::Define | + getDeadRegState(isDead)), + Src, isKill, MI->getOperand(2).getImm()); + break; + } + case X86::ADD16ri: + case X86::ADD16ri8: + case X86::ADD16ri_DB: + case X86::ADD16ri8_DB: + if (DisableLEA16) + return is64Bit ? 
convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0; + assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); + NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) + .addReg(Dest, RegState::Define | + getDeadRegState(isDead)), + Src, isKill, MI->getOperand(2).getImm()); + break; + } + } + } + + if (!NewMI) return 0; + + if (LV) { // Update live variables + if (isKill) + LV->replaceKillInstruction(Src, MI, NewMI); + if (isDead) + LV->replaceKillInstruction(Dest, MI, NewMI); + } + + MFI->insert(MBBI, NewMI); // Insert the new inst + return NewMI; +} + +/// commuteInstruction - We have a few instructions that must be hacked on to +/// commute them. +/// +MachineInstr * +X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { + switch (MI->getOpcode()) { + case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I) + case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I) + case X86::SHRD32rri8: // A = SHRD32rri8 B, C, I -> A = SHLD32rri8 C, B, (32-I) + case X86::SHLD32rri8: // A = SHLD32rri8 B, C, I -> A = SHRD32rri8 C, B, (32-I) + case X86::SHRD64rri8: // A = SHRD64rri8 B, C, I -> A = SHLD64rri8 C, B, (64-I) + case X86::SHLD64rri8:{// A = SHLD64rri8 B, C, I -> A = SHRD64rri8 C, B, (64-I) + unsigned Opc; + unsigned Size; + switch (MI->getOpcode()) { + default: llvm_unreachable("Unreachable!"); + case X86::SHRD16rri8: Size = 16; Opc = X86::SHLD16rri8; break; + case X86::SHLD16rri8: Size = 16; Opc = X86::SHRD16rri8; break; + case X86::SHRD32rri8: Size = 32; Opc = X86::SHLD32rri8; break; + case X86::SHLD32rri8: Size = 32; Opc = X86::SHRD32rri8; break; + case X86::SHRD64rri8: Size = 64; Opc = X86::SHLD64rri8; break; + case X86::SHLD64rri8: Size = 64; Opc = X86::SHRD64rri8; break; + } + unsigned Amt = MI->getOperand(3).getImm(); + if (NewMI) { + MachineFunction &MF = *MI->getParent()->getParent(); + MI = MF.CloneMachineInstr(MI); + NewMI = false; + } + MI->setDesc(get(Opc)); + MI->getOperand(3).setImm(Size-Amt); + return TargetInstrInfoImpl::commuteInstruction(MI, NewMI); + } + case X86::CMOVB16rr: + case X86::CMOVB32rr: + case X86::CMOVB64rr: + case X86::CMOVAE16rr: + case X86::CMOVAE32rr: + case X86::CMOVAE64rr: + case X86::CMOVE16rr: + case X86::CMOVE32rr: + case X86::CMOVE64rr: + case X86::CMOVNE16rr: + case X86::CMOVNE32rr: + case X86::CMOVNE64rr: + case X86::CMOVBE16rr: + case X86::CMOVBE32rr: + case X86::CMOVBE64rr: + case X86::CMOVA16rr: + case X86::CMOVA32rr: + case X86::CMOVA64rr: + case X86::CMOVL16rr: + case X86::CMOVL32rr: + case X86::CMOVL64rr: + case X86::CMOVGE16rr: + case X86::CMOVGE32rr: + case X86::CMOVGE64rr: + case X86::CMOVLE16rr: + case X86::CMOVLE32rr: + case X86::CMOVLE64rr: + case X86::CMOVG16rr: + case X86::CMOVG32rr: + case X86::CMOVG64rr: + case X86::CMOVS16rr: + case X86::CMOVS32rr: + case X86::CMOVS64rr: + case X86::CMOVNS16rr: + case X86::CMOVNS32rr: + case X86::CMOVNS64rr: + case X86::CMOVP16rr: + case X86::CMOVP32rr: + case X86::CMOVP64rr: + case X86::CMOVNP16rr: + case X86::CMOVNP32rr: + case X86::CMOVNP64rr: + case X86::CMOVO16rr: + case X86::CMOVO32rr: + case X86::CMOVO64rr: + case X86::CMOVNO16rr: + case X86::CMOVNO32rr: + case X86::CMOVNO64rr: { + unsigned Opc = 0; + switch (MI->getOpcode()) { + default: break; + case X86::CMOVB16rr: Opc = X86::CMOVAE16rr; break; + case X86::CMOVB32rr: Opc = X86::CMOVAE32rr; break; + case X86::CMOVB64rr: Opc = X86::CMOVAE64rr; break; + case X86::CMOVAE16rr: Opc = X86::CMOVB16rr; break; + case X86::CMOVAE32rr: Opc = X86::CMOVB32rr; break; + 
case X86::CMOVAE64rr: Opc = X86::CMOVB64rr; break; + case X86::CMOVE16rr: Opc = X86::CMOVNE16rr; break; + case X86::CMOVE32rr: Opc = X86::CMOVNE32rr; break; + case X86::CMOVE64rr: Opc = X86::CMOVNE64rr; break; + case X86::CMOVNE16rr: Opc = X86::CMOVE16rr; break; + case X86::CMOVNE32rr: Opc = X86::CMOVE32rr; break; + case X86::CMOVNE64rr: Opc = X86::CMOVE64rr; break; + case X86::CMOVBE16rr: Opc = X86::CMOVA16rr; break; + case X86::CMOVBE32rr: Opc = X86::CMOVA32rr; break; + case X86::CMOVBE64rr: Opc = X86::CMOVA64rr; break; + case X86::CMOVA16rr: Opc = X86::CMOVBE16rr; break; + case X86::CMOVA32rr: Opc = X86::CMOVBE32rr; break; + case X86::CMOVA64rr: Opc = X86::CMOVBE64rr; break; + case X86::CMOVL16rr: Opc = X86::CMOVGE16rr; break; + case X86::CMOVL32rr: Opc = X86::CMOVGE32rr; break; + case X86::CMOVL64rr: Opc = X86::CMOVGE64rr; break; + case X86::CMOVGE16rr: Opc = X86::CMOVL16rr; break; + case X86::CMOVGE32rr: Opc = X86::CMOVL32rr; break; + case X86::CMOVGE64rr: Opc = X86::CMOVL64rr; break; + case X86::CMOVLE16rr: Opc = X86::CMOVG16rr; break; + case X86::CMOVLE32rr: Opc = X86::CMOVG32rr; break; + case X86::CMOVLE64rr: Opc = X86::CMOVG64rr; break; + case X86::CMOVG16rr: Opc = X86::CMOVLE16rr; break; + case X86::CMOVG32rr: Opc = X86::CMOVLE32rr; break; + case X86::CMOVG64rr: Opc = X86::CMOVLE64rr; break; + case X86::CMOVS16rr: Opc = X86::CMOVNS16rr; break; + case X86::CMOVS32rr: Opc = X86::CMOVNS32rr; break; + case X86::CMOVS64rr: Opc = X86::CMOVNS64rr; break; + case X86::CMOVNS16rr: Opc = X86::CMOVS16rr; break; + case X86::CMOVNS32rr: Opc = X86::CMOVS32rr; break; + case X86::CMOVNS64rr: Opc = X86::CMOVS64rr; break; + case X86::CMOVP16rr: Opc = X86::CMOVNP16rr; break; + case X86::CMOVP32rr: Opc = X86::CMOVNP32rr; break; + case X86::CMOVP64rr: Opc = X86::CMOVNP64rr; break; + case X86::CMOVNP16rr: Opc = X86::CMOVP16rr; break; + case X86::CMOVNP32rr: Opc = X86::CMOVP32rr; break; + case X86::CMOVNP64rr: Opc = X86::CMOVP64rr; break; + case X86::CMOVO16rr: Opc = X86::CMOVNO16rr; break; + case X86::CMOVO32rr: Opc = X86::CMOVNO32rr; break; + case X86::CMOVO64rr: Opc = X86::CMOVNO64rr; break; + case X86::CMOVNO16rr: Opc = X86::CMOVO16rr; break; + case X86::CMOVNO32rr: Opc = X86::CMOVO32rr; break; + case X86::CMOVNO64rr: Opc = X86::CMOVO64rr; break; + } + if (NewMI) { + MachineFunction &MF = *MI->getParent()->getParent(); + MI = MF.CloneMachineInstr(MI); + NewMI = false; + } + MI->setDesc(get(Opc)); + // Fallthrough intended. 
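+    // (The fall through into the default case lets the target-independent
+    // commuteInstruction do the actual operand swap; the condition was
+    // inverted above because swapping the two sources of a CMOV changes
+    // which value is selected.)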
+ } + default: + return TargetInstrInfoImpl::commuteInstruction(MI, NewMI); + } +} + +static X86::CondCode GetCondFromBranchOpc(unsigned BrOpc) { + switch (BrOpc) { + default: return X86::COND_INVALID; + case X86::JE_4: return X86::COND_E; + case X86::JNE_4: return X86::COND_NE; + case X86::JL_4: return X86::COND_L; + case X86::JLE_4: return X86::COND_LE; + case X86::JG_4: return X86::COND_G; + case X86::JGE_4: return X86::COND_GE; + case X86::JB_4: return X86::COND_B; + case X86::JBE_4: return X86::COND_BE; + case X86::JA_4: return X86::COND_A; + case X86::JAE_4: return X86::COND_AE; + case X86::JS_4: return X86::COND_S; + case X86::JNS_4: return X86::COND_NS; + case X86::JP_4: return X86::COND_P; + case X86::JNP_4: return X86::COND_NP; + case X86::JO_4: return X86::COND_O; + case X86::JNO_4: return X86::COND_NO; + } +} + +unsigned X86::GetCondBranchFromCond(X86::CondCode CC) { + switch (CC) { + default: llvm_unreachable("Illegal condition code!"); + case X86::COND_E: return X86::JE_4; + case X86::COND_NE: return X86::JNE_4; + case X86::COND_L: return X86::JL_4; + case X86::COND_LE: return X86::JLE_4; + case X86::COND_G: return X86::JG_4; + case X86::COND_GE: return X86::JGE_4; + case X86::COND_B: return X86::JB_4; + case X86::COND_BE: return X86::JBE_4; + case X86::COND_A: return X86::JA_4; + case X86::COND_AE: return X86::JAE_4; + case X86::COND_S: return X86::JS_4; + case X86::COND_NS: return X86::JNS_4; + case X86::COND_P: return X86::JP_4; + case X86::COND_NP: return X86::JNP_4; + case X86::COND_O: return X86::JO_4; + case X86::COND_NO: return X86::JNO_4; + } +} + +/// GetOppositeBranchCondition - Return the inverse of the specified condition, +/// e.g. turning COND_E to COND_NE. +X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) { + switch (CC) { + default: llvm_unreachable("Illegal condition code!"); + case X86::COND_E: return X86::COND_NE; + case X86::COND_NE: return X86::COND_E; + case X86::COND_L: return X86::COND_GE; + case X86::COND_LE: return X86::COND_G; + case X86::COND_G: return X86::COND_LE; + case X86::COND_GE: return X86::COND_L; + case X86::COND_B: return X86::COND_AE; + case X86::COND_BE: return X86::COND_A; + case X86::COND_A: return X86::COND_BE; + case X86::COND_AE: return X86::COND_B; + case X86::COND_S: return X86::COND_NS; + case X86::COND_NS: return X86::COND_S; + case X86::COND_P: return X86::COND_NP; + case X86::COND_NP: return X86::COND_P; + case X86::COND_O: return X86::COND_NO; + case X86::COND_NO: return X86::COND_O; + } +} + +bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const { + const MCInstrDesc &MCID = MI->getDesc(); + if (!MCID.isTerminator()) return false; + + // Conditional branch is a special case. + if (MCID.isBranch() && !MCID.isBarrier()) + return true; + if (!MCID.isPredicable()) + return true; + return !isPredicated(MI); +} + +bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, + MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const { + // Start from the bottom of the block and work up, examining the + // terminator instructions. + MachineBasicBlock::iterator I = MBB.end(); + MachineBasicBlock::iterator UnCondBrIter = MBB.end(); + while (I != MBB.begin()) { + --I; + if (I->isDebugValue()) + continue; + + // Working from the bottom, when we see a non-terminator instruction, we're + // done. + if (!isUnpredicatedTerminator(I)) + break; + + // A terminator that isn't a branch can't easily be handled by this + // analysis. 
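+    // (For example a return instruction; reporting true tells the caller
+    // that the terminators of this block could not be analyzed.)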
+ if (!I->getDesc().isBranch()) + return true; + + // Handle unconditional branches. + if (I->getOpcode() == X86::JMP_4) { + UnCondBrIter = I; + + if (!AllowModify) { + TBB = I->getOperand(0).getMBB(); + continue; + } + + // If the block has any instructions after a JMP, delete them. + while (llvm::next(I) != MBB.end()) + llvm::next(I)->eraseFromParent(); + + Cond.clear(); + FBB = 0; + + // Delete the JMP if it's equivalent to a fall-through. + if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) { + TBB = 0; + I->eraseFromParent(); + I = MBB.end(); + UnCondBrIter = MBB.end(); + continue; + } + + // TBB is used to indicate the unconditional destination. + TBB = I->getOperand(0).getMBB(); + continue; + } + + // Handle conditional branches. + X86::CondCode BranchCode = GetCondFromBranchOpc(I->getOpcode()); + if (BranchCode == X86::COND_INVALID) + return true; // Can't handle indirect branch. + + // Working from the bottom, handle the first conditional branch. + if (Cond.empty()) { + MachineBasicBlock *TargetBB = I->getOperand(0).getMBB(); + if (AllowModify && UnCondBrIter != MBB.end() && + MBB.isLayoutSuccessor(TargetBB)) { + // If we can modify the code and it ends in something like: + // + // jCC L1 + // jmp L2 + // L1: + // ... + // L2: + // + // Then we can change this to: + // + // jnCC L2 + // L1: + // ... + // L2: + // + // Which is a bit more efficient. + // We conditionally jump to the fall-through block. + BranchCode = GetOppositeBranchCondition(BranchCode); + unsigned JNCC = GetCondBranchFromCond(BranchCode); + MachineBasicBlock::iterator OldInst = I; + + BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(JNCC)) + .addMBB(UnCondBrIter->getOperand(0).getMBB()); + BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JMP_4)) + .addMBB(TargetBB); + + OldInst->eraseFromParent(); + UnCondBrIter->eraseFromParent(); + + // Restart the analysis. + UnCondBrIter = MBB.end(); + I = MBB.end(); + continue; + } + + FBB = TBB; + TBB = I->getOperand(0).getMBB(); + Cond.push_back(MachineOperand::CreateImm(BranchCode)); + continue; + } + + // Handle subsequent conditional branches. Only handle the case where all + // conditional branches branch to the same destination and their condition + // opcodes fit one of the special multi-branch idioms. + assert(Cond.size() == 1); + assert(TBB); + + // Only handle the case where all conditional branches branch to the same + // destination. + if (TBB != I->getOperand(0).getMBB()) + return true; + + // If the conditions are the same, we can leave them alone. + X86::CondCode OldBranchCode = (X86::CondCode)Cond[0].getImm(); + if (OldBranchCode == BranchCode) + continue; + + // If they differ, see if they fit one of the known patterns. Theoretically, + // we could handle more patterns here, but we shouldn't expect to see them + // if instruction selection has done a reasonable job. + if ((OldBranchCode == X86::COND_NP && + BranchCode == X86::COND_E) || + (OldBranchCode == X86::COND_E && + BranchCode == X86::COND_NP)) + BranchCode = X86::COND_NP_OR_E; + else if ((OldBranchCode == X86::COND_P && + BranchCode == X86::COND_NE) || + (OldBranchCode == X86::COND_NE && + BranchCode == X86::COND_P)) + BranchCode = X86::COND_NE_OR_P; + else + return true; + + // Update the MachineOperand. 
+ Cond[0].setImm(BranchCode); + } + + return false; +} + +unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { + MachineBasicBlock::iterator I = MBB.end(); + unsigned Count = 0; + + while (I != MBB.begin()) { + --I; + if (I->isDebugValue()) + continue; + if (I->getOpcode() != X86::JMP_4 && + GetCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID) + break; + // Remove the branch. + I->eraseFromParent(); + I = MBB.end(); + ++Count; + } + + return Count; +} + +unsigned +X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + const SmallVectorImpl<MachineOperand> &Cond, + DebugLoc DL) const { + // Shouldn't be a fall through. + assert(TBB && "InsertBranch must not be told to insert a fallthrough"); + assert((Cond.size() == 1 || Cond.size() == 0) && + "X86 branch conditions have one component!"); + + if (Cond.empty()) { + // Unconditional branch? + assert(!FBB && "Unconditional branch with multiple successors!"); + BuildMI(&MBB, DL, get(X86::JMP_4)).addMBB(TBB); + return 1; + } + + // Conditional branch. + unsigned Count = 0; + X86::CondCode CC = (X86::CondCode)Cond[0].getImm(); + switch (CC) { + case X86::COND_NP_OR_E: + // Synthesize NP_OR_E with two branches. + BuildMI(&MBB, DL, get(X86::JNP_4)).addMBB(TBB); + ++Count; + BuildMI(&MBB, DL, get(X86::JE_4)).addMBB(TBB); + ++Count; + break; + case X86::COND_NE_OR_P: + // Synthesize NE_OR_P with two branches. + BuildMI(&MBB, DL, get(X86::JNE_4)).addMBB(TBB); + ++Count; + BuildMI(&MBB, DL, get(X86::JP_4)).addMBB(TBB); + ++Count; + break; + default: { + unsigned Opc = GetCondBranchFromCond(CC); + BuildMI(&MBB, DL, get(Opc)).addMBB(TBB); + ++Count; + } + } + if (FBB) { + // Two-way Conditional branch. Insert the second branch. + BuildMI(&MBB, DL, get(X86::JMP_4)).addMBB(FBB); + ++Count; + } + return Count; +} + +/// isHReg - Test if the given register is a physical h register. +static bool isHReg(unsigned Reg) { + return X86::GR8_ABCD_HRegClass.contains(Reg); +} + +// Try and copy between VR128/VR64 and GR64 registers. +static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg, + bool HasAVX) { + // SrcReg(VR128) -> DestReg(GR64) + // SrcReg(VR64) -> DestReg(GR64) + // SrcReg(GR64) -> DestReg(VR128) + // SrcReg(GR64) -> DestReg(VR64) + + if (X86::GR64RegClass.contains(DestReg)) { + if (X86::VR128RegClass.contains(SrcReg)) { + // Copy from a VR128 register to a GR64 register. + return HasAVX ? X86::VMOVPQIto64rr : X86::MOVPQIto64rr; + } else if (X86::VR64RegClass.contains(SrcReg)) { + // Copy from a VR64 register to a GR64 register. + return X86::MOVSDto64rr; + } + } else if (X86::GR64RegClass.contains(SrcReg)) { + // Copy from a GR64 register to a VR128 register. + if (X86::VR128RegClass.contains(DestReg)) + return HasAVX ? X86::VMOV64toPQIrr : X86::MOV64toPQIrr; + // Copy from a GR64 register to a VR64 register. + else if (X86::VR64RegClass.contains(DestReg)) + return X86::MOV64toSDrr; + } + + // SrcReg(FR32) -> DestReg(GR32) + // SrcReg(GR32) -> DestReg(FR32) + + if (X86::GR32RegClass.contains(DestReg) && X86::FR32RegClass.contains(SrcReg)) + // Copy from a FR32 register to a GR32 register. + return HasAVX ? X86::VMOVSS2DIrr : X86::MOVSS2DIrr; + + if (X86::FR32RegClass.contains(DestReg) && X86::GR32RegClass.contains(SrcReg)) + // Copy from a GR32 register to a FR32 register. + return HasAVX ? 
X86::VMOVDI2SSrr : X86::MOVDI2SSrr; + + return 0; +} + +void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { + // First deal with the normal symmetric copies. + bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX(); + unsigned Opc = 0; + if (X86::GR64RegClass.contains(DestReg, SrcReg)) + Opc = X86::MOV64rr; + else if (X86::GR32RegClass.contains(DestReg, SrcReg)) + Opc = X86::MOV32rr; + else if (X86::GR16RegClass.contains(DestReg, SrcReg)) + Opc = X86::MOV16rr; + else if (X86::GR8RegClass.contains(DestReg, SrcReg)) { + // Copying to or from a physical H register on x86-64 requires a NOREX + // move. Otherwise use a normal move. + if ((isHReg(DestReg) || isHReg(SrcReg)) && + TM.getSubtarget<X86Subtarget>().is64Bit()) { + Opc = X86::MOV8rr_NOREX; + // Both operands must be encodable without an REX prefix. + assert(X86::GR8_NOREXRegClass.contains(SrcReg, DestReg) && + "8-bit H register can not be copied outside GR8_NOREX"); + } else + Opc = X86::MOV8rr; + } else if (X86::VR128RegClass.contains(DestReg, SrcReg)) + Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr; + else if (X86::VR256RegClass.contains(DestReg, SrcReg)) + Opc = X86::VMOVAPSYrr; + else if (X86::VR64RegClass.contains(DestReg, SrcReg)) + Opc = X86::MMX_MOVQ64rr; + else + Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, HasAVX); + + if (Opc) { + BuildMI(MBB, MI, DL, get(Opc), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + } + + // Moving EFLAGS to / from another register requires a push and a pop. + if (SrcReg == X86::EFLAGS) { + if (X86::GR64RegClass.contains(DestReg)) { + BuildMI(MBB, MI, DL, get(X86::PUSHF64)); + BuildMI(MBB, MI, DL, get(X86::POP64r), DestReg); + return; + } else if (X86::GR32RegClass.contains(DestReg)) { + BuildMI(MBB, MI, DL, get(X86::PUSHF32)); + BuildMI(MBB, MI, DL, get(X86::POP32r), DestReg); + return; + } + } + if (DestReg == X86::EFLAGS) { + if (X86::GR64RegClass.contains(SrcReg)) { + BuildMI(MBB, MI, DL, get(X86::PUSH64r)) + .addReg(SrcReg, getKillRegState(KillSrc)); + BuildMI(MBB, MI, DL, get(X86::POPF64)); + return; + } else if (X86::GR32RegClass.contains(SrcReg)) { + BuildMI(MBB, MI, DL, get(X86::PUSH32r)) + .addReg(SrcReg, getKillRegState(KillSrc)); + BuildMI(MBB, MI, DL, get(X86::POPF32)); + return; + } + } + + DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg) + << " to " << RI.getName(DestReg) << '\n'); + llvm_unreachable("Cannot emit physreg copy instruction"); +} + +static unsigned getLoadStoreRegOpcode(unsigned Reg, + const TargetRegisterClass *RC, + bool isStackAligned, + const TargetMachine &TM, + bool load) { + bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX(); + switch (RC->getSize()) { + default: + llvm_unreachable("Unknown spill size"); + case 1: + assert(X86::GR8RegClass.hasSubClassEq(RC) && "Unknown 1-byte regclass"); + if (TM.getSubtarget<X86Subtarget>().is64Bit()) + // Copying to or from a physical H register on x86-64 requires a NOREX + // move. Otherwise use a normal move. + if (isHReg(Reg) || X86::GR8_ABCD_HRegClass.hasSubClassEq(RC)) + return load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX; + return load ? X86::MOV8rm : X86::MOV8mr; + case 2: + assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass"); + return load ? X86::MOV16rm : X86::MOV16mr; + case 4: + if (X86::GR32RegClass.hasSubClassEq(RC)) + return load ? X86::MOV32rm : X86::MOV32mr; + if (X86::FR32RegClass.hasSubClassEq(RC)) + return load ? + (HasAVX ? 
X86::VMOVSSrm : X86::MOVSSrm) : + (HasAVX ? X86::VMOVSSmr : X86::MOVSSmr); + if (X86::RFP32RegClass.hasSubClassEq(RC)) + return load ? X86::LD_Fp32m : X86::ST_Fp32m; + llvm_unreachable("Unknown 4-byte regclass"); + case 8: + if (X86::GR64RegClass.hasSubClassEq(RC)) + return load ? X86::MOV64rm : X86::MOV64mr; + if (X86::FR64RegClass.hasSubClassEq(RC)) + return load ? + (HasAVX ? X86::VMOVSDrm : X86::MOVSDrm) : + (HasAVX ? X86::VMOVSDmr : X86::MOVSDmr); + if (X86::VR64RegClass.hasSubClassEq(RC)) + return load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr; + if (X86::RFP64RegClass.hasSubClassEq(RC)) + return load ? X86::LD_Fp64m : X86::ST_Fp64m; + llvm_unreachable("Unknown 8-byte regclass"); + case 10: + assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass"); + return load ? X86::LD_Fp80m : X86::ST_FpP80m; + case 16: { + assert(X86::VR128RegClass.hasSubClassEq(RC) && "Unknown 16-byte regclass"); + // If stack is realigned we can use aligned stores. + if (isStackAligned) + return load ? + (HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm) : + (HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr); + else + return load ? + (HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm) : + (HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr); + } + case 32: + assert(X86::VR256RegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass"); + // If stack is realigned we can use aligned stores. + if (isStackAligned) + return load ? X86::VMOVAPSYrm : X86::VMOVAPSYmr; + else + return load ? X86::VMOVUPSYrm : X86::VMOVUPSYmr; + } +} + +static unsigned getStoreRegOpcode(unsigned SrcReg, + const TargetRegisterClass *RC, + bool isStackAligned, + TargetMachine &TM) { + return getLoadStoreRegOpcode(SrcReg, RC, isStackAligned, TM, false); +} + + +static unsigned getLoadRegOpcode(unsigned DestReg, + const TargetRegisterClass *RC, + bool isStackAligned, + const TargetMachine &TM) { + return getLoadStoreRegOpcode(DestReg, RC, isStackAligned, TM, true); +} + +void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned SrcReg, bool isKill, int FrameIdx, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + const MachineFunction &MF = *MBB.getParent(); + assert(MF.getFrameInfo()->getObjectSize(FrameIdx) >= RC->getSize() && + "Stack slot too small for store"); + unsigned Alignment = RC->getSize() == 32 ? 32 : 16; + bool isAligned = (TM.getFrameLowering()->getStackAlignment() >= Alignment) || + RI.canRealignStack(MF); + unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, TM); + DebugLoc DL = MBB.findDebugLoc(MI); + addFrameReference(BuildMI(MBB, MI, DL, get(Opc)), FrameIdx) + .addReg(SrcReg, getKillRegState(isKill)); +} + +void X86InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg, + bool isKill, + SmallVectorImpl<MachineOperand> &Addr, + const TargetRegisterClass *RC, + MachineInstr::mmo_iterator MMOBegin, + MachineInstr::mmo_iterator MMOEnd, + SmallVectorImpl<MachineInstr*> &NewMIs) const { + unsigned Alignment = RC->getSize() == 32 ? 
32 : 16; + bool isAligned = MMOBegin != MMOEnd && + (*MMOBegin)->getAlignment() >= Alignment; + unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, TM); + DebugLoc DL; + MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc)); + for (unsigned i = 0, e = Addr.size(); i != e; ++i) + MIB.addOperand(Addr[i]); + MIB.addReg(SrcReg, getKillRegState(isKill)); + (*MIB).setMemRefs(MMOBegin, MMOEnd); + NewMIs.push_back(MIB); +} + + +void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, int FrameIdx, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + const MachineFunction &MF = *MBB.getParent(); + unsigned Alignment = RC->getSize() == 32 ? 32 : 16; + bool isAligned = (TM.getFrameLowering()->getStackAlignment() >= Alignment) || + RI.canRealignStack(MF); + unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, TM); + DebugLoc DL = MBB.findDebugLoc(MI); + addFrameReference(BuildMI(MBB, MI, DL, get(Opc), DestReg), FrameIdx); +} + +void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg, + SmallVectorImpl<MachineOperand> &Addr, + const TargetRegisterClass *RC, + MachineInstr::mmo_iterator MMOBegin, + MachineInstr::mmo_iterator MMOEnd, + SmallVectorImpl<MachineInstr*> &NewMIs) const { + unsigned Alignment = RC->getSize() == 32 ? 32 : 16; + bool isAligned = MMOBegin != MMOEnd && + (*MMOBegin)->getAlignment() >= Alignment; + unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, TM); + DebugLoc DL; + MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg); + for (unsigned i = 0, e = Addr.size(); i != e; ++i) + MIB.addOperand(Addr[i]); + (*MIB).setMemRefs(MMOBegin, MMOEnd); + NewMIs.push_back(MIB); +} + +/// Expand2AddrUndef - Expand a single-def pseudo instruction to a two-addr +/// instruction with two undef reads of the register being defined. This is +/// used for mapping: +/// %xmm4 = V_SET0 +/// to: +/// %xmm4 = PXORrr %xmm4<undef>, %xmm4<undef> +/// +static bool Expand2AddrUndef(MachineInstr *MI, const MCInstrDesc &Desc) { + assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction."); + unsigned Reg = MI->getOperand(0).getReg(); + MI->setDesc(Desc); + + // MachineInstr::addOperand() will insert explicit operands before any + // implicit operands. + MachineInstrBuilder(MI).addReg(Reg, RegState::Undef) + .addReg(Reg, RegState::Undef); + // But we don't trust that. + assert(MI->getOperand(1).getReg() == Reg && + MI->getOperand(2).getReg() == Reg && "Misplaced operand"); + return true; +} + +bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { + bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX(); + switch (MI->getOpcode()) { + case X86::V_SET0: + return Expand2AddrUndef(MI, get(HasAVX ? X86::VPXORrr : X86::PXORrr)); + case X86::TEST8ri_NOREX: + MI->setDesc(get(X86::TEST8ri)); + return true; + } + return false; +} + +MachineInstr* +X86InstrInfo::emitFrameIndexDebugValue(MachineFunction &MF, + int FrameIx, uint64_t Offset, + const MDNode *MDPtr, + DebugLoc DL) const { + X86AddressMode AM; + AM.BaseType = X86AddressMode::FrameIndexBase; + AM.Base.FrameIndex = FrameIx; + MachineInstrBuilder MIB = BuildMI(MF, DL, get(X86::DBG_VALUE)); + addFullAddress(MIB, AM).addImm(Offset).addMetadata(MDPtr); + return &*MIB; +} + +static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode, + const SmallVectorImpl<MachineOperand> &MOs, + MachineInstr *MI, + const TargetInstrInfo &TII) { + // Create the base instruction with the memory operand as the first part. 
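+ // The incoming MOs describe the memory reference as the usual five-part
+ // x86 address (base, scale, index, displacement, segment). A bare frame
+ // index fold passes only the FI operand, and the remaining parts are
+ // filled in below by addOffset(), giving roughly:
+ //   <fi#N>, 1, %noreg, 0, %noreg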
+ MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode), + MI->getDebugLoc(), true); + MachineInstrBuilder MIB(NewMI); + unsigned NumAddrOps = MOs.size(); + for (unsigned i = 0; i != NumAddrOps; ++i) + MIB.addOperand(MOs[i]); + if (NumAddrOps < 4) // FrameIndex only + addOffset(MIB, 0); + + // Loop over the rest of the ri operands, converting them over. + unsigned NumOps = MI->getDesc().getNumOperands()-2; + for (unsigned i = 0; i != NumOps; ++i) { + MachineOperand &MO = MI->getOperand(i+2); + MIB.addOperand(MO); + } + for (unsigned i = NumOps+2, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + MIB.addOperand(MO); + } + return MIB; +} + +static MachineInstr *FuseInst(MachineFunction &MF, + unsigned Opcode, unsigned OpNo, + const SmallVectorImpl<MachineOperand> &MOs, + MachineInstr *MI, const TargetInstrInfo &TII) { + MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode), + MI->getDebugLoc(), true); + MachineInstrBuilder MIB(NewMI); + + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (i == OpNo) { + assert(MO.isReg() && "Expected to fold into reg operand!"); + unsigned NumAddrOps = MOs.size(); + for (unsigned i = 0; i != NumAddrOps; ++i) + MIB.addOperand(MOs[i]); + if (NumAddrOps < 4) // FrameIndex only + addOffset(MIB, 0); + } else { + MIB.addOperand(MO); + } + } + return MIB; +} + +static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode, + const SmallVectorImpl<MachineOperand> &MOs, + MachineInstr *MI) { + MachineFunction &MF = *MI->getParent()->getParent(); + MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), TII.get(Opcode)); + + unsigned NumAddrOps = MOs.size(); + for (unsigned i = 0; i != NumAddrOps; ++i) + MIB.addOperand(MOs[i]); + if (NumAddrOps < 4) // FrameIndex only + addOffset(MIB, 0); + return MIB.addImm(0); +} + +MachineInstr* +X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, + MachineInstr *MI, unsigned i, + const SmallVectorImpl<MachineOperand> &MOs, + unsigned Size, unsigned Align) const { + const DenseMap<unsigned, std::pair<unsigned,unsigned> > *OpcodeTablePtr = 0; + bool isTwoAddrFold = false; + unsigned NumOps = MI->getDesc().getNumOperands(); + bool isTwoAddr = NumOps > 1 && + MI->getDesc().getOperandConstraint(1, MCOI::TIED_TO) != -1; + + // FIXME: AsmPrinter doesn't know how to handle + // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding. + if (MI->getOpcode() == X86::ADD32ri && + MI->getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS) + return NULL; + + MachineInstr *NewMI = NULL; + // Folding a memory location into the two-address part of a two-address + // instruction is different than folding it other places. It requires + // replacing the *two* registers with the memory location. 
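+ // For example, folding a stack slot into "ADD32rr %reg, %src", where %reg
+ // is tied as both the def and the first use, produces an ADD32mr whose
+ // memory reference stands in for both tied register operands.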
+ if (isTwoAddr && NumOps >= 2 && i < 2 && + MI->getOperand(0).isReg() && + MI->getOperand(1).isReg() && + MI->getOperand(0).getReg() == MI->getOperand(1).getReg()) { + OpcodeTablePtr = &RegOp2MemOpTable2Addr; + isTwoAddrFold = true; + } else if (i == 0) { // If operand 0 + if (MI->getOpcode() == X86::MOV64r0) + NewMI = MakeM0Inst(*this, X86::MOV64mi32, MOs, MI); + else if (MI->getOpcode() == X86::MOV32r0) + NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, MI); + else if (MI->getOpcode() == X86::MOV16r0) + NewMI = MakeM0Inst(*this, X86::MOV16mi, MOs, MI); + else if (MI->getOpcode() == X86::MOV8r0) + NewMI = MakeM0Inst(*this, X86::MOV8mi, MOs, MI); + if (NewMI) + return NewMI; + + OpcodeTablePtr = &RegOp2MemOpTable0; + } else if (i == 1) { + OpcodeTablePtr = &RegOp2MemOpTable1; + } else if (i == 2) { + OpcodeTablePtr = &RegOp2MemOpTable2; + } + + // If table selected... + if (OpcodeTablePtr) { + // Find the Opcode to fuse + DenseMap<unsigned, std::pair<unsigned,unsigned> >::const_iterator I = + OpcodeTablePtr->find(MI->getOpcode()); + if (I != OpcodeTablePtr->end()) { + unsigned Opcode = I->second.first; + unsigned MinAlign = (I->second.second & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT; + if (Align < MinAlign) + return NULL; + bool NarrowToMOV32rm = false; + if (Size) { + unsigned RCSize = getRegClass(MI->getDesc(), i, &RI)->getSize(); + if (Size < RCSize) { + // Check if it's safe to fold the load. If the size of the object is + // narrower than the load width, then it's not. + if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4) + return NULL; + // If this is a 64-bit load, but the spill slot is 32, then we can do + // a 32-bit load which is implicitly zero-extended. This likely is due + // to liveintervalanalysis remat'ing a load from stack slot. + if (MI->getOperand(0).getSubReg() || MI->getOperand(1).getSubReg()) + return NULL; + Opcode = X86::MOV32rm; + NarrowToMOV32rm = true; + } + } + + if (isTwoAddrFold) + NewMI = FuseTwoAddrInst(MF, Opcode, MOs, MI, *this); + else + NewMI = FuseInst(MF, Opcode, i, MOs, MI, *this); + + if (NarrowToMOV32rm) { + // If this is the special case where we use a MOV32rm to load a 32-bit + // value and zero-extend the top bits. Change the destination register + // to a 32-bit one. + unsigned DstReg = NewMI->getOperand(0).getReg(); + if (TargetRegisterInfo::isPhysicalRegister(DstReg)) + NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, + X86::sub_32bit)); + else + NewMI->getOperand(0).setSubReg(X86::sub_32bit); + } + return NewMI; + } + } + + // No fusion + if (PrintFailedFusing && !MI->isCopy()) + dbgs() << "We failed to fuse operand " << i << " in " << *MI; + return NULL; +} + +/// hasPartialRegUpdate - Return true for all instructions that only update +/// the first 32 or 64-bits of the destination register and leave the rest +/// unmodified. This can be used to avoid folding loads if the instructions +/// only update part of the destination register, and the non-updated part is +/// not needed. e.g. cvtss2sd, sqrtss. Unfolding the load from these +/// instructions breaks the partial register dependency and it can improve +/// performance. e.g.: +/// +/// movss (%rdi), %xmm0 +/// cvtss2sd %xmm0, %xmm0 +/// +/// Instead of +/// cvtss2sd (%rdi), %xmm0 +/// +/// FIXME: This should be turned into a TSFlags. 
+/// +static bool hasPartialRegUpdate(unsigned Opcode) { + switch (Opcode) { + case X86::CVTSD2SSrr: + case X86::Int_CVTSD2SSrr: + case X86::CVTSS2SDrr: + case X86::Int_CVTSS2SDrr: + case X86::RCPSSr: + case X86::RCPSSr_Int: + case X86::ROUNDSDr: + case X86::ROUNDSSr: + case X86::RSQRTSSr: + case X86::RSQRTSSr_Int: + case X86::SQRTSSr: + case X86::SQRTSSr_Int: + // AVX encoded versions + case X86::VCVTSD2SSrr: + case X86::Int_VCVTSD2SSrr: + case X86::VCVTSS2SDrr: + case X86::Int_VCVTSS2SDrr: + case X86::VRCPSSr: + case X86::VROUNDSDr: + case X86::VROUNDSSr: + case X86::VRSQRTSSr: + case X86::VSQRTSSr: + return true; + } + + return false; +} + +MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, + MachineInstr *MI, + const SmallVectorImpl<unsigned> &Ops, + int FrameIndex) const { + // Check switch flag + if (NoFusing) return NULL; + + // Unless optimizing for size, don't fold to avoid partial + // register update stalls + if (!MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize) && + hasPartialRegUpdate(MI->getOpcode())) + return 0; + + const MachineFrameInfo *MFI = MF.getFrameInfo(); + unsigned Size = MFI->getObjectSize(FrameIndex); + unsigned Alignment = MFI->getObjectAlignment(FrameIndex); + if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) { + unsigned NewOpc = 0; + unsigned RCSize = 0; + switch (MI->getOpcode()) { + default: return NULL; + case X86::TEST8rr: NewOpc = X86::CMP8ri; RCSize = 1; break; + case X86::TEST16rr: NewOpc = X86::CMP16ri8; RCSize = 2; break; + case X86::TEST32rr: NewOpc = X86::CMP32ri8; RCSize = 4; break; + case X86::TEST64rr: NewOpc = X86::CMP64ri8; RCSize = 8; break; + } + // Check if it's safe to fold the load. If the size of the object is + // narrower than the load width, then it's not. + if (Size < RCSize) + return NULL; + // Change to CMPXXri r, 0 first. + MI->setDesc(get(NewOpc)); + MI->getOperand(1).ChangeToImmediate(0); + } else if (Ops.size() != 1) + return NULL; + + SmallVector<MachineOperand,4> MOs; + MOs.push_back(MachineOperand::CreateFI(FrameIndex)); + return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, Size, Alignment); +} + +MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, + MachineInstr *MI, + const SmallVectorImpl<unsigned> &Ops, + MachineInstr *LoadMI) const { + // Check switch flag + if (NoFusing) return NULL; + + // Unless optimizing for size, don't fold to avoid partial + // register update stalls + if (!MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize) && + hasPartialRegUpdate(MI->getOpcode())) + return 0; + + // Determine the alignment of the load. + unsigned Alignment = 0; + if (LoadMI->hasOneMemOperand()) + Alignment = (*LoadMI->memoperands_begin())->getAlignment(); + else + switch (LoadMI->getOpcode()) { + case X86::AVX_SET0PSY: + case X86::AVX_SET0PDY: + Alignment = 32; + break; + case X86::V_SET0: + case X86::V_SETALLONES: + case X86::AVX_SETALLONES: + Alignment = 16; + break; + case X86::FsFLD0SD: + case X86::VFsFLD0SD: + Alignment = 8; + break; + case X86::FsFLD0SS: + case X86::VFsFLD0SS: + Alignment = 4; + break; + default: + return 0; + } + if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) { + unsigned NewOpc = 0; + switch (MI->getOpcode()) { + default: return NULL; + case X86::TEST8rr: NewOpc = X86::CMP8ri; break; + case X86::TEST16rr: NewOpc = X86::CMP16ri8; break; + case X86::TEST32rr: NewOpc = X86::CMP32ri8; break; + case X86::TEST64rr: NewOpc = X86::CMP64ri8; break; + } + // Change to CMPXXri r, 0 first. 
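+ // (The TESTrr of a register against itself is rewritten as a compare with
+ // zero so that operand 0 can be folded below, yielding a memory-immediate
+ // compare of the loaded value against 0.)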
+ MI->setDesc(get(NewOpc)); + MI->getOperand(1).ChangeToImmediate(0); + } else if (Ops.size() != 1) + return NULL; + + // Make sure the subregisters match. + // Otherwise we risk changing the size of the load. + if (LoadMI->getOperand(0).getSubReg() != MI->getOperand(Ops[0]).getSubReg()) + return NULL; + + SmallVector<MachineOperand,X86::AddrNumOperands> MOs; + switch (LoadMI->getOpcode()) { + case X86::V_SET0: + case X86::V_SETALLONES: + case X86::AVX_SET0PSY: + case X86::AVX_SET0PDY: + case X86::AVX_SETALLONES: + case X86::FsFLD0SD: + case X86::FsFLD0SS: + case X86::VFsFLD0SD: + case X86::VFsFLD0SS: { + // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure. + // Create a constant-pool entry and operands to load from it. + + // Medium and large mode can't fold loads this way. + if (TM.getCodeModel() != CodeModel::Small && + TM.getCodeModel() != CodeModel::Kernel) + return NULL; + + // x86-32 PIC requires a PIC base register for constant pools. + unsigned PICBase = 0; + if (TM.getRelocationModel() == Reloc::PIC_) { + if (TM.getSubtarget<X86Subtarget>().is64Bit()) + PICBase = X86::RIP; + else + // FIXME: PICBase = getGlobalBaseReg(&MF); + // This doesn't work for several reasons. + // 1. GlobalBaseReg may have been spilled. + // 2. It may not be live at MI. + return NULL; + } + + // Create a constant-pool entry. + MachineConstantPool &MCP = *MF.getConstantPool(); + Type *Ty; + unsigned Opc = LoadMI->getOpcode(); + if (Opc == X86::FsFLD0SS || Opc == X86::VFsFLD0SS) + Ty = Type::getFloatTy(MF.getFunction()->getContext()); + else if (Opc == X86::FsFLD0SD || Opc == X86::VFsFLD0SD) + Ty = Type::getDoubleTy(MF.getFunction()->getContext()); + else if (Opc == X86::AVX_SET0PSY || Opc == X86::AVX_SET0PDY) + Ty = VectorType::get(Type::getFloatTy(MF.getFunction()->getContext()), 8); + else + Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4); + + bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX_SETALLONES); + const Constant *C = IsAllOnes ? Constant::getAllOnesValue(Ty) : + Constant::getNullValue(Ty); + unsigned CPI = MCP.getConstantPoolIndex(C, Alignment); + + // Create operands to load from the constant pool entry. + MOs.push_back(MachineOperand::CreateReg(PICBase, false)); + MOs.push_back(MachineOperand::CreateImm(1)); + MOs.push_back(MachineOperand::CreateReg(0, false)); + MOs.push_back(MachineOperand::CreateCPI(CPI, 0)); + MOs.push_back(MachineOperand::CreateReg(0, false)); + break; + } + default: { + // Folding a normal load. Just copy the load's address operands. + unsigned NumOps = LoadMI->getDesc().getNumOperands(); + for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i) + MOs.push_back(LoadMI->getOperand(i)); + break; + } + } + return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, 0, Alignment); +} + + +bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI, + const SmallVectorImpl<unsigned> &Ops) const { + // Check switch flag + if (NoFusing) return 0; + + if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) { + switch (MI->getOpcode()) { + default: return false; + case X86::TEST8rr: + case X86::TEST16rr: + case X86::TEST32rr: + case X86::TEST64rr: + return true; + case X86::ADD32ri: + // FIXME: AsmPrinter doesn't know how to handle + // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding. 
+ if (MI->getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS) + return false; + break; + } + } + + if (Ops.size() != 1) + return false; + + unsigned OpNum = Ops[0]; + unsigned Opc = MI->getOpcode(); + unsigned NumOps = MI->getDesc().getNumOperands(); + bool isTwoAddr = NumOps > 1 && + MI->getDesc().getOperandConstraint(1, MCOI::TIED_TO) != -1; + + // Folding a memory location into the two-address part of a two-address + // instruction is different than folding it other places. It requires + // replacing the *two* registers with the memory location. + const DenseMap<unsigned, std::pair<unsigned,unsigned> > *OpcodeTablePtr = 0; + if (isTwoAddr && NumOps >= 2 && OpNum < 2) { + OpcodeTablePtr = &RegOp2MemOpTable2Addr; + } else if (OpNum == 0) { // If operand 0 + switch (Opc) { + case X86::MOV8r0: + case X86::MOV16r0: + case X86::MOV32r0: + case X86::MOV64r0: return true; + default: break; + } + OpcodeTablePtr = &RegOp2MemOpTable0; + } else if (OpNum == 1) { + OpcodeTablePtr = &RegOp2MemOpTable1; + } else if (OpNum == 2) { + OpcodeTablePtr = &RegOp2MemOpTable2; + } + + if (OpcodeTablePtr && OpcodeTablePtr->count(Opc)) + return true; + return TargetInstrInfoImpl::canFoldMemoryOperand(MI, Ops); +} + +bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, + unsigned Reg, bool UnfoldLoad, bool UnfoldStore, + SmallVectorImpl<MachineInstr*> &NewMIs) const { + DenseMap<unsigned, std::pair<unsigned,unsigned> >::const_iterator I = + MemOp2RegOpTable.find(MI->getOpcode()); + if (I == MemOp2RegOpTable.end()) + return false; + unsigned Opc = I->second.first; + unsigned Index = I->second.second & TB_INDEX_MASK; + bool FoldedLoad = I->second.second & TB_FOLDED_LOAD; + bool FoldedStore = I->second.second & TB_FOLDED_STORE; + if (UnfoldLoad && !FoldedLoad) + return false; + UnfoldLoad &= FoldedLoad; + if (UnfoldStore && !FoldedStore) + return false; + UnfoldStore &= FoldedStore; + + const MCInstrDesc &MCID = get(Opc); + const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI); + if (!MI->hasOneMemOperand() && + RC == &X86::VR128RegClass && + !TM.getSubtarget<X86Subtarget>().isUnalignedMemAccessFast()) + // Without memoperands, loadRegFromAddr and storeRegToStackSlot will + // conservatively assume the address is unaligned. That's bad for + // performance. + return false; + SmallVector<MachineOperand, X86::AddrNumOperands> AddrOps; + SmallVector<MachineOperand,2> BeforeOps; + SmallVector<MachineOperand,2> AfterOps; + SmallVector<MachineOperand,4> ImpOps; + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &Op = MI->getOperand(i); + if (i >= Index && i < Index + X86::AddrNumOperands) + AddrOps.push_back(Op); + else if (Op.isReg() && Op.isImplicit()) + ImpOps.push_back(Op); + else if (i < Index) + BeforeOps.push_back(Op); + else if (i > Index) + AfterOps.push_back(Op); + } + + // Emit the load instruction. + if (UnfoldLoad) { + std::pair<MachineInstr::mmo_iterator, + MachineInstr::mmo_iterator> MMOs = + MF.extractLoadMemRefs(MI->memoperands_begin(), + MI->memoperands_end()); + loadRegFromAddr(MF, Reg, AddrOps, RC, MMOs.first, MMOs.second, NewMIs); + if (UnfoldStore) { + // Address operands cannot be marked isKill. + for (unsigned i = 1; i != 1 + X86::AddrNumOperands; ++i) { + MachineOperand &MO = NewMIs[0]->getOperand(i); + if (MO.isReg()) + MO.setIsKill(false); + } + } + } + + // Emit the data processing instruction. 
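+ // The register form of the instruction is rebuilt here; for example,
+ // unfolding ADD32rm yields the load emitted above followed by an ADD32rr
+ // that reads Reg in place of the folded memory operand.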
+ MachineInstr *DataMI = MF.CreateMachineInstr(MCID, MI->getDebugLoc(), true); + MachineInstrBuilder MIB(DataMI); + + if (FoldedStore) + MIB.addReg(Reg, RegState::Define); + for (unsigned i = 0, e = BeforeOps.size(); i != e; ++i) + MIB.addOperand(BeforeOps[i]); + if (FoldedLoad) + MIB.addReg(Reg); + for (unsigned i = 0, e = AfterOps.size(); i != e; ++i) + MIB.addOperand(AfterOps[i]); + for (unsigned i = 0, e = ImpOps.size(); i != e; ++i) { + MachineOperand &MO = ImpOps[i]; + MIB.addReg(MO.getReg(), + getDefRegState(MO.isDef()) | + RegState::Implicit | + getKillRegState(MO.isKill()) | + getDeadRegState(MO.isDead()) | + getUndefRegState(MO.isUndef())); + } + // Change CMP32ri r, 0 back to TEST32rr r, r, etc. + unsigned NewOpc = 0; + switch (DataMI->getOpcode()) { + default: break; + case X86::CMP64ri32: + case X86::CMP64ri8: + case X86::CMP32ri: + case X86::CMP32ri8: + case X86::CMP16ri: + case X86::CMP16ri8: + case X86::CMP8ri: { + MachineOperand &MO0 = DataMI->getOperand(0); + MachineOperand &MO1 = DataMI->getOperand(1); + if (MO1.getImm() == 0) { + switch (DataMI->getOpcode()) { + default: break; + case X86::CMP64ri8: + case X86::CMP64ri32: NewOpc = X86::TEST64rr; break; + case X86::CMP32ri8: + case X86::CMP32ri: NewOpc = X86::TEST32rr; break; + case X86::CMP16ri8: + case X86::CMP16ri: NewOpc = X86::TEST16rr; break; + case X86::CMP8ri: NewOpc = X86::TEST8rr; break; + } + DataMI->setDesc(get(NewOpc)); + MO1.ChangeToRegister(MO0.getReg(), false); + } + } + } + NewMIs.push_back(DataMI); + + // Emit the store instruction. + if (UnfoldStore) { + const TargetRegisterClass *DstRC = getRegClass(MCID, 0, &RI); + std::pair<MachineInstr::mmo_iterator, + MachineInstr::mmo_iterator> MMOs = + MF.extractStoreMemRefs(MI->memoperands_begin(), + MI->memoperands_end()); + storeRegToAddr(MF, Reg, true, AddrOps, DstRC, MMOs.first, MMOs.second, NewMIs); + } + + return true; +} + +bool +X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, + SmallVectorImpl<SDNode*> &NewNodes) const { + if (!N->isMachineOpcode()) + return false; + + DenseMap<unsigned, std::pair<unsigned,unsigned> >::const_iterator I = + MemOp2RegOpTable.find(N->getMachineOpcode()); + if (I == MemOp2RegOpTable.end()) + return false; + unsigned Opc = I->second.first; + unsigned Index = I->second.second & TB_INDEX_MASK; + bool FoldedLoad = I->second.second & TB_FOLDED_LOAD; + bool FoldedStore = I->second.second & TB_FOLDED_STORE; + const MCInstrDesc &MCID = get(Opc); + const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI); + unsigned NumDefs = MCID.NumDefs; + std::vector<SDValue> AddrOps; + std::vector<SDValue> BeforeOps; + std::vector<SDValue> AfterOps; + DebugLoc dl = N->getDebugLoc(); + unsigned NumOps = N->getNumOperands(); + for (unsigned i = 0; i != NumOps-1; ++i) { + SDValue Op = N->getOperand(i); + if (i >= Index-NumDefs && i < Index-NumDefs + X86::AddrNumOperands) + AddrOps.push_back(Op); + else if (i < Index-NumDefs) + BeforeOps.push_back(Op); + else if (i > Index-NumDefs) + AfterOps.push_back(Op); + } + SDValue Chain = N->getOperand(NumOps-1); + AddrOps.push_back(Chain); + + // Emit the load instruction. 
+ SDNode *Load = 0; + MachineFunction &MF = DAG.getMachineFunction(); + if (FoldedLoad) { + EVT VT = *RC->vt_begin(); + std::pair<MachineInstr::mmo_iterator, + MachineInstr::mmo_iterator> MMOs = + MF.extractLoadMemRefs(cast<MachineSDNode>(N)->memoperands_begin(), + cast<MachineSDNode>(N)->memoperands_end()); + if (!(*MMOs.first) && + RC == &X86::VR128RegClass && + !TM.getSubtarget<X86Subtarget>().isUnalignedMemAccessFast()) + // Do not introduce a slow unaligned load. + return false; + unsigned Alignment = RC->getSize() == 32 ? 32 : 16; + bool isAligned = (*MMOs.first) && + (*MMOs.first)->getAlignment() >= Alignment; + Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, TM), dl, + VT, MVT::Other, &AddrOps[0], AddrOps.size()); + NewNodes.push_back(Load); + + // Preserve memory reference information. + cast<MachineSDNode>(Load)->setMemRefs(MMOs.first, MMOs.second); + } + + // Emit the data processing instruction. + std::vector<EVT> VTs; + const TargetRegisterClass *DstRC = 0; + if (MCID.getNumDefs() > 0) { + DstRC = getRegClass(MCID, 0, &RI); + VTs.push_back(*DstRC->vt_begin()); + } + for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) { + EVT VT = N->getValueType(i); + if (VT != MVT::Other && i >= (unsigned)MCID.getNumDefs()) + VTs.push_back(VT); + } + if (Load) + BeforeOps.push_back(SDValue(Load, 0)); + std::copy(AfterOps.begin(), AfterOps.end(), std::back_inserter(BeforeOps)); + SDNode *NewNode= DAG.getMachineNode(Opc, dl, VTs, &BeforeOps[0], + BeforeOps.size()); + NewNodes.push_back(NewNode); + + // Emit the store instruction. + if (FoldedStore) { + AddrOps.pop_back(); + AddrOps.push_back(SDValue(NewNode, 0)); + AddrOps.push_back(Chain); + std::pair<MachineInstr::mmo_iterator, + MachineInstr::mmo_iterator> MMOs = + MF.extractStoreMemRefs(cast<MachineSDNode>(N)->memoperands_begin(), + cast<MachineSDNode>(N)->memoperands_end()); + if (!(*MMOs.first) && + RC == &X86::VR128RegClass && + !TM.getSubtarget<X86Subtarget>().isUnalignedMemAccessFast()) + // Do not introduce a slow unaligned store. + return false; + unsigned Alignment = RC->getSize() == 32 ? 32 : 16; + bool isAligned = (*MMOs.first) && + (*MMOs.first)->getAlignment() >= Alignment; + SDNode *Store = DAG.getMachineNode(getStoreRegOpcode(0, DstRC, + isAligned, TM), + dl, MVT::Other, + &AddrOps[0], AddrOps.size()); + NewNodes.push_back(Store); + + // Preserve memory reference information. 
+ cast<MachineSDNode>(Load)->setMemRefs(MMOs.first, MMOs.second); + } + + return true; +} + +unsigned X86InstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc, + bool UnfoldLoad, bool UnfoldStore, + unsigned *LoadRegIndex) const { + DenseMap<unsigned, std::pair<unsigned,unsigned> >::const_iterator I = + MemOp2RegOpTable.find(Opc); + if (I == MemOp2RegOpTable.end()) + return 0; + bool FoldedLoad = I->second.second & TB_FOLDED_LOAD; + bool FoldedStore = I->second.second & TB_FOLDED_STORE; + if (UnfoldLoad && !FoldedLoad) + return 0; + if (UnfoldStore && !FoldedStore) + return 0; + if (LoadRegIndex) + *LoadRegIndex = I->second.second & TB_INDEX_MASK; + return I->second.first; +} + +bool +X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, + int64_t &Offset1, int64_t &Offset2) const { + if (!Load1->isMachineOpcode() || !Load2->isMachineOpcode()) + return false; + unsigned Opc1 = Load1->getMachineOpcode(); + unsigned Opc2 = Load2->getMachineOpcode(); + switch (Opc1) { + default: return false; + case X86::MOV8rm: + case X86::MOV16rm: + case X86::MOV32rm: + case X86::MOV64rm: + case X86::LD_Fp32m: + case X86::LD_Fp64m: + case X86::LD_Fp80m: + case X86::MOVSSrm: + case X86::MOVSDrm: + case X86::MMX_MOVD64rm: + case X86::MMX_MOVQ64rm: + case X86::FsMOVAPSrm: + case X86::FsMOVAPDrm: + case X86::MOVAPSrm: + case X86::MOVUPSrm: + case X86::MOVAPDrm: + case X86::MOVDQArm: + case X86::MOVDQUrm: + // AVX load instructions + case X86::VMOVSSrm: + case X86::VMOVSDrm: + case X86::FsVMOVAPSrm: + case X86::FsVMOVAPDrm: + case X86::VMOVAPSrm: + case X86::VMOVUPSrm: + case X86::VMOVAPDrm: + case X86::VMOVDQArm: + case X86::VMOVDQUrm: + case X86::VMOVAPSYrm: + case X86::VMOVUPSYrm: + case X86::VMOVAPDYrm: + case X86::VMOVDQAYrm: + case X86::VMOVDQUYrm: + break; + } + switch (Opc2) { + default: return false; + case X86::MOV8rm: + case X86::MOV16rm: + case X86::MOV32rm: + case X86::MOV64rm: + case X86::LD_Fp32m: + case X86::LD_Fp64m: + case X86::LD_Fp80m: + case X86::MOVSSrm: + case X86::MOVSDrm: + case X86::MMX_MOVD64rm: + case X86::MMX_MOVQ64rm: + case X86::FsMOVAPSrm: + case X86::FsMOVAPDrm: + case X86::MOVAPSrm: + case X86::MOVUPSrm: + case X86::MOVAPDrm: + case X86::MOVDQArm: + case X86::MOVDQUrm: + // AVX load instructions + case X86::VMOVSSrm: + case X86::VMOVSDrm: + case X86::FsVMOVAPSrm: + case X86::FsVMOVAPDrm: + case X86::VMOVAPSrm: + case X86::VMOVUPSrm: + case X86::VMOVAPDrm: + case X86::VMOVDQArm: + case X86::VMOVDQUrm: + case X86::VMOVAPSYrm: + case X86::VMOVUPSYrm: + case X86::VMOVAPDYrm: + case X86::VMOVDQAYrm: + case X86::VMOVDQUYrm: + break; + } + + // Check if chain operands and base addresses match. + if (Load1->getOperand(0) != Load2->getOperand(0) || + Load1->getOperand(5) != Load2->getOperand(5)) + return false; + // Segment operands should match as well. + if (Load1->getOperand(4) != Load2->getOperand(4)) + return false; + // Scale should be 1, Index should be Reg0. + if (Load1->getOperand(1) == Load2->getOperand(1) && + Load1->getOperand(2) == Load2->getOperand(2)) { + if (cast<ConstantSDNode>(Load1->getOperand(1))->getZExtValue() != 1) + return false; + + // Now let's examine the displacements. 
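+ // For example, two MOV64rm loads from [%rax+16] and [%rax+24] share the
+ // same chain, base, segment and a scale of 1 with no index, so they reach
+ // this point and return Offset1 = 16, Offset2 = 24.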
+ if (isa<ConstantSDNode>(Load1->getOperand(3)) && + isa<ConstantSDNode>(Load2->getOperand(3))) { + Offset1 = cast<ConstantSDNode>(Load1->getOperand(3))->getSExtValue(); + Offset2 = cast<ConstantSDNode>(Load2->getOperand(3))->getSExtValue(); + return true; + } + } + return false; +} + +bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, + int64_t Offset1, int64_t Offset2, + unsigned NumLoads) const { + assert(Offset2 > Offset1); + if ((Offset2 - Offset1) / 8 > 64) + return false; + + unsigned Opc1 = Load1->getMachineOpcode(); + unsigned Opc2 = Load2->getMachineOpcode(); + if (Opc1 != Opc2) + return false; // FIXME: overly conservative? + + switch (Opc1) { + default: break; + case X86::LD_Fp32m: + case X86::LD_Fp64m: + case X86::LD_Fp80m: + case X86::MMX_MOVD64rm: + case X86::MMX_MOVQ64rm: + return false; + } + + EVT VT = Load1->getValueType(0); + switch (VT.getSimpleVT().SimpleTy) { + default: + // XMM registers. In 64-bit mode we can be a bit more aggressive since we + // have 16 of them to play with. + if (TM.getSubtargetImpl()->is64Bit()) { + if (NumLoads >= 3) + return false; + } else if (NumLoads) { + return false; + } + break; + case MVT::i8: + case MVT::i16: + case MVT::i32: + case MVT::i64: + case MVT::f32: + case MVT::f64: + if (NumLoads) + return false; + break; + } + + return true; +} + + +bool X86InstrInfo:: +ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const { + assert(Cond.size() == 1 && "Invalid X86 branch condition!"); + X86::CondCode CC = static_cast<X86::CondCode>(Cond[0].getImm()); + if (CC == X86::COND_NE_OR_P || CC == X86::COND_NP_OR_E) + return true; + Cond[0].setImm(GetOppositeBranchCondition(CC)); + return false; +} + +bool X86InstrInfo:: +isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const { + // FIXME: Return false for x87 stack register classes for now. We can't + // allow any loads of these registers before FpGet_ST0_80. + return !(RC == &X86::CCRRegClass || RC == &X86::RFP32RegClass || + RC == &X86::RFP64RegClass || RC == &X86::RFP80RegClass); +} + +/// getGlobalBaseReg - Return a virtual register initialized with the +/// the global base register value. Output instructions required to +/// initialize the register in the function entry block, if necessary. +/// +/// TODO: Eliminate this and move the code to X86MachineFunctionInfo. +/// +unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const { + assert(!TM.getSubtarget<X86Subtarget>().is64Bit() && + "X86-64 PIC uses RIP relative addressing"); + + X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>(); + unsigned GlobalBaseReg = X86FI->getGlobalBaseReg(); + if (GlobalBaseReg != 0) + return GlobalBaseReg; + + // Create the register. The code to initialize it is inserted + // later, by the CGBR pass (below). + MachineRegisterInfo &RegInfo = MF->getRegInfo(); + GlobalBaseReg = RegInfo.createVirtualRegister(X86::GR32RegisterClass); + X86FI->setGlobalBaseReg(GlobalBaseReg); + return GlobalBaseReg; +} + +// These are the replaceable SSE instructions. Some of these have Int variants +// that we don't include here. We don't want to replace instructions selected +// by intrinsics. 
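+// Each row gives the PackedSingle, PackedDouble and PackedInt encodings of
+// one operation; setExecutionDomain() below simply switches an instruction
+// to the requested column (e.g. PXORrr <-> XORPSrr), which lets the domain
+// fixup pass avoid costly cross-domain bypasses.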
+static const unsigned ReplaceableInstrs[][3] = { + //PackedSingle PackedDouble PackedInt + { X86::MOVAPSmr, X86::MOVAPDmr, X86::MOVDQAmr }, + { X86::MOVAPSrm, X86::MOVAPDrm, X86::MOVDQArm }, + { X86::MOVAPSrr, X86::MOVAPDrr, X86::MOVDQArr }, + { X86::MOVUPSmr, X86::MOVUPDmr, X86::MOVDQUmr }, + { X86::MOVUPSrm, X86::MOVUPDrm, X86::MOVDQUrm }, + { X86::MOVNTPSmr, X86::MOVNTPDmr, X86::MOVNTDQmr }, + { X86::ANDNPSrm, X86::ANDNPDrm, X86::PANDNrm }, + { X86::ANDNPSrr, X86::ANDNPDrr, X86::PANDNrr }, + { X86::ANDPSrm, X86::ANDPDrm, X86::PANDrm }, + { X86::ANDPSrr, X86::ANDPDrr, X86::PANDrr }, + { X86::ORPSrm, X86::ORPDrm, X86::PORrm }, + { X86::ORPSrr, X86::ORPDrr, X86::PORrr }, + { X86::XORPSrm, X86::XORPDrm, X86::PXORrm }, + { X86::XORPSrr, X86::XORPDrr, X86::PXORrr }, + // AVX 128-bit support + { X86::VMOVAPSmr, X86::VMOVAPDmr, X86::VMOVDQAmr }, + { X86::VMOVAPSrm, X86::VMOVAPDrm, X86::VMOVDQArm }, + { X86::VMOVAPSrr, X86::VMOVAPDrr, X86::VMOVDQArr }, + { X86::VMOVUPSmr, X86::VMOVUPDmr, X86::VMOVDQUmr }, + { X86::VMOVUPSrm, X86::VMOVUPDrm, X86::VMOVDQUrm }, + { X86::VMOVNTPSmr, X86::VMOVNTPDmr, X86::VMOVNTDQmr }, + { X86::VANDNPSrm, X86::VANDNPDrm, X86::VPANDNrm }, + { X86::VANDNPSrr, X86::VANDNPDrr, X86::VPANDNrr }, + { X86::VANDPSrm, X86::VANDPDrm, X86::VPANDrm }, + { X86::VANDPSrr, X86::VANDPDrr, X86::VPANDrr }, + { X86::VORPSrm, X86::VORPDrm, X86::VPORrm }, + { X86::VORPSrr, X86::VORPDrr, X86::VPORrr }, + { X86::VXORPSrm, X86::VXORPDrm, X86::VPXORrm }, + { X86::VXORPSrr, X86::VXORPDrr, X86::VPXORrr }, + // AVX 256-bit support + { X86::VMOVAPSYmr, X86::VMOVAPDYmr, X86::VMOVDQAYmr }, + { X86::VMOVAPSYrm, X86::VMOVAPDYrm, X86::VMOVDQAYrm }, + { X86::VMOVAPSYrr, X86::VMOVAPDYrr, X86::VMOVDQAYrr }, + { X86::VMOVUPSYmr, X86::VMOVUPDYmr, X86::VMOVDQUYmr }, + { X86::VMOVUPSYrm, X86::VMOVUPDYrm, X86::VMOVDQUYrm }, + { X86::VMOVNTPSYmr, X86::VMOVNTPDYmr, X86::VMOVNTDQYmr }, +}; + +// FIXME: Some shuffle and unpack instructions have equivalents in different +// domains, but they require a bit more work than just switching opcodes. + +static const unsigned *lookup(unsigned opcode, unsigned domain) { + for (unsigned i = 0, e = array_lengthof(ReplaceableInstrs); i != e; ++i) + if (ReplaceableInstrs[i][domain-1] == opcode) + return ReplaceableInstrs[i]; + return 0; +} + +std::pair<uint16_t, uint16_t> +X86InstrInfo::getExecutionDomain(const MachineInstr *MI) const { + uint16_t domain = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3; + return std::make_pair(domain, + domain && lookup(MI->getOpcode(), domain) ? 0xe : 0); +} + +void X86InstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const { + assert(Domain>0 && Domain<4 && "Invalid execution domain"); + uint16_t dom = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3; + assert(dom && "Not an SSE instruction"); + const unsigned *table = lookup(MI->getOpcode(), dom); + assert(table && "Cannot change domain"); + MI->setDesc(get(table[Domain-1])); +} + +/// getNoopForMachoTarget - Return the noop instruction to use for a noop. 
+void X86InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { + NopInst.setOpcode(X86::NOOP); +} + +bool X86InstrInfo::isHighLatencyDef(int opc) const { + switch (opc) { + default: return false; + case X86::DIVSDrm: + case X86::DIVSDrm_Int: + case X86::DIVSDrr: + case X86::DIVSDrr_Int: + case X86::DIVSSrm: + case X86::DIVSSrm_Int: + case X86::DIVSSrr: + case X86::DIVSSrr_Int: + case X86::SQRTPDm: + case X86::SQRTPDm_Int: + case X86::SQRTPDr: + case X86::SQRTPDr_Int: + case X86::SQRTPSm: + case X86::SQRTPSm_Int: + case X86::SQRTPSr: + case X86::SQRTPSr_Int: + case X86::SQRTSDm: + case X86::SQRTSDm_Int: + case X86::SQRTSDr: + case X86::SQRTSDr_Int: + case X86::SQRTSSm: + case X86::SQRTSSm_Int: + case X86::SQRTSSr: + case X86::SQRTSSr_Int: + // AVX instructions with high latency + case X86::VDIVSDrm: + case X86::VDIVSDrm_Int: + case X86::VDIVSDrr: + case X86::VDIVSDrr_Int: + case X86::VDIVSSrm: + case X86::VDIVSSrm_Int: + case X86::VDIVSSrr: + case X86::VDIVSSrr_Int: + case X86::VSQRTPDm: + case X86::VSQRTPDm_Int: + case X86::VSQRTPDr: + case X86::VSQRTPDr_Int: + case X86::VSQRTPSm: + case X86::VSQRTPSm_Int: + case X86::VSQRTPSr: + case X86::VSQRTPSr_Int: + case X86::VSQRTSDm: + case X86::VSQRTSDm_Int: + case X86::VSQRTSDr: + case X86::VSQRTSSm: + case X86::VSQRTSSm_Int: + case X86::VSQRTSSr: + return true; + } +} + +bool X86InstrInfo:: +hasHighOperandLatency(const InstrItineraryData *ItinData, + const MachineRegisterInfo *MRI, + const MachineInstr *DefMI, unsigned DefIdx, + const MachineInstr *UseMI, unsigned UseIdx) const { + return isHighLatencyDef(DefMI->getOpcode()); +} + +namespace { + /// CGBR - Create Global Base Reg pass. This initializes the PIC + /// global base register for x86-32. + struct CGBR : public MachineFunctionPass { + static char ID; + CGBR() : MachineFunctionPass(ID) {} + + virtual bool runOnMachineFunction(MachineFunction &MF) { + const X86TargetMachine *TM = + static_cast<const X86TargetMachine *>(&MF.getTarget()); + + assert(!TM->getSubtarget<X86Subtarget>().is64Bit() && + "X86-64 PIC uses RIP relative addressing"); + + // Only emit a global base reg in PIC mode. + if (TM->getRelocationModel() != Reloc::PIC_) + return false; + + X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + unsigned GlobalBaseReg = X86FI->getGlobalBaseReg(); + + // If we didn't need a GlobalBaseReg, don't insert code. + if (GlobalBaseReg == 0) + return false; + + // Insert the set of GlobalBaseReg into the first MBB of the function + MachineBasicBlock &FirstMBB = MF.front(); + MachineBasicBlock::iterator MBBI = FirstMBB.begin(); + DebugLoc DL = FirstMBB.findDebugLoc(MBBI); + MachineRegisterInfo &RegInfo = MF.getRegInfo(); + const X86InstrInfo *TII = TM->getInstrInfo(); + + unsigned PC; + if (TM->getSubtarget<X86Subtarget>().isPICStyleGOT()) + PC = RegInfo.createVirtualRegister(X86::GR32RegisterClass); + else + PC = GlobalBaseReg; + + // Operand of MovePCtoStack is completely ignored by asm printer. It's + // only used in JIT code emission as displacement to pc. + BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOVPC32r), PC).addImm(0); + + // If we're using vanilla 'GOT' PIC style, we should use relative addressing + // not to pc, but to _GLOBAL_OFFSET_TABLE_ external. 
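+ // For non-GOT (Darwin-style) PIC the PC value itself acts as the global
+ // base, so PC was set to GlobalBaseReg above and nothing more is needed;
+ // for GOT-style PIC the distance to _GLOBAL_OFFSET_TABLE_ is added below.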
+ if (TM->getSubtarget<X86Subtarget>().isPICStyleGOT()) { + // Generate addl $__GLOBAL_OFFSET_TABLE_ + [.-piclabel], %some_register + BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD32ri), GlobalBaseReg) + .addReg(PC).addExternalSymbol("_GLOBAL_OFFSET_TABLE_", + X86II::MO_GOT_ABSOLUTE_ADDRESS); + } + + return true; + } + + virtual const char *getPassName() const { + return "X86 PIC Global Base Reg Initialization"; + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + }; +} + +char CGBR::ID = 0; +FunctionPass* +llvm::createGlobalBaseRegPass() { return new CGBR(); } diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.h b/contrib/llvm/lib/Target/X86/X86InstrInfo.h new file mode 100644 index 0000000..97009db --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.h @@ -0,0 +1,375 @@ +//===- X86InstrInfo.h - X86 Instruction Information ------------*- C++ -*- ===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the X86 implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef X86INSTRUCTIONINFO_H +#define X86INSTRUCTIONINFO_H + +#include "llvm/Target/TargetInstrInfo.h" +#include "X86.h" +#include "X86RegisterInfo.h" +#include "llvm/ADT/DenseMap.h" + +#define GET_INSTRINFO_HEADER +#include "X86GenInstrInfo.inc" + +namespace llvm { + class X86RegisterInfo; + class X86TargetMachine; + +namespace X86 { + // X86 specific condition code. These correspond to X86_*_COND in + // X86InstrInfo.td. They must be kept in synch. + enum CondCode { + COND_A = 0, + COND_AE = 1, + COND_B = 2, + COND_BE = 3, + COND_E = 4, + COND_G = 5, + COND_GE = 6, + COND_L = 7, + COND_LE = 8, + COND_NE = 9, + COND_NO = 10, + COND_NP = 11, + COND_NS = 12, + COND_O = 13, + COND_P = 14, + COND_S = 15, + + // Artificial condition codes. These are used by AnalyzeBranch + // to indicate a block terminated with two conditional branches to + // the same location. This occurs in code using FCMP_OEQ or FCMP_UNE, + // which can't be represented on x86 with a single condition. These + // are never used in MachineInstrs. + COND_NE_OR_P, + COND_NP_OR_E, + + COND_INVALID + }; + + // Turn condition code into conditional branch opcode. + unsigned GetCondBranchFromCond(CondCode CC); + + /// GetOppositeBranchCondition - Return the inverse of the specified cond, + /// e.g. turning COND_E to COND_NE. + CondCode GetOppositeBranchCondition(X86::CondCode CC); +} // end namespace X86; + + +/// isGlobalStubReference - Return true if the specified TargetFlag operand is +/// a reference to a stub for a global, not the global itself. +inline static bool isGlobalStubReference(unsigned char TargetFlag) { + switch (TargetFlag) { + case X86II::MO_DLLIMPORT: // dllimport stub. + case X86II::MO_GOTPCREL: // rip-relative GOT reference. + case X86II::MO_GOT: // normal GOT reference. + case X86II::MO_DARWIN_NONLAZY_PIC_BASE: // Normal $non_lazy_ptr ref. + case X86II::MO_DARWIN_NONLAZY: // Normal $non_lazy_ptr ref. + case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: // Hidden $non_lazy_ptr ref. 
+ return true; + default: + return false; + } +} + +/// isGlobalRelativeToPICBase - Return true if the specified global value +/// reference is relative to a 32-bit PIC base (X86ISD::GlobalBaseReg). If this +/// is true, the addressing mode has the PIC base register added in (e.g. EBX). +inline static bool isGlobalRelativeToPICBase(unsigned char TargetFlag) { + switch (TargetFlag) { + case X86II::MO_GOTOFF: // isPICStyleGOT: local global. + case X86II::MO_GOT: // isPICStyleGOT: other global. + case X86II::MO_PIC_BASE_OFFSET: // Darwin local global. + case X86II::MO_DARWIN_NONLAZY_PIC_BASE: // Darwin/32 external global. + case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: // Darwin/32 hidden global. + case X86II::MO_TLVP: // ??? Pretty sure.. + return true; + default: + return false; + } +} + +inline static bool isScale(const MachineOperand &MO) { + return MO.isImm() && + (MO.getImm() == 1 || MO.getImm() == 2 || + MO.getImm() == 4 || MO.getImm() == 8); +} + +inline static bool isLeaMem(const MachineInstr *MI, unsigned Op) { + if (MI->getOperand(Op).isFI()) return true; + return Op+4 <= MI->getNumOperands() && + MI->getOperand(Op ).isReg() && isScale(MI->getOperand(Op+1)) && + MI->getOperand(Op+2).isReg() && + (MI->getOperand(Op+3).isImm() || + MI->getOperand(Op+3).isGlobal() || + MI->getOperand(Op+3).isCPI() || + MI->getOperand(Op+3).isJTI()); +} + +inline static bool isMem(const MachineInstr *MI, unsigned Op) { + if (MI->getOperand(Op).isFI()) return true; + return Op+5 <= MI->getNumOperands() && + MI->getOperand(Op+4).isReg() && + isLeaMem(MI, Op); +} + +class X86InstrInfo : public X86GenInstrInfo { + X86TargetMachine &TM; + const X86RegisterInfo RI; + + /// RegOp2MemOpTable2Addr, RegOp2MemOpTable0, RegOp2MemOpTable1, + /// RegOp2MemOpTable2 - Load / store folding opcode maps. + /// + typedef DenseMap<unsigned, + std::pair<unsigned, unsigned> > RegOp2MemOpTableType; + RegOp2MemOpTableType RegOp2MemOpTable2Addr; + RegOp2MemOpTableType RegOp2MemOpTable0; + RegOp2MemOpTableType RegOp2MemOpTable1; + RegOp2MemOpTableType RegOp2MemOpTable2; + + /// MemOp2RegOpTable - Load / store unfolding opcode map. + /// + typedef DenseMap<unsigned, + std::pair<unsigned, unsigned> > MemOp2RegOpTableType; + MemOp2RegOpTableType MemOp2RegOpTable; + + void AddTableEntry(RegOp2MemOpTableType &R2MTable, + MemOp2RegOpTableType &M2RTable, + unsigned RegOp, unsigned MemOp, unsigned Flags); + +public: + explicit X86InstrInfo(X86TargetMachine &tm); + + /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As + /// such, whenever a client has an instance of instruction info, it should + /// always be able to get register info as well (through this method). + /// + virtual const X86RegisterInfo &getRegisterInfo() const { return RI; } + + /// isCoalescableExtInstr - Return true if the instruction is a "coalescable" + /// extension instruction. That is, it's like a copy where it's legal for the + /// source to overlap the destination. e.g. X86::MOVSX64rr32. If this returns + /// true, then it's expected the pre-extension value is available as a subreg + /// of the result register. This also returns the sub-register index in + /// SubIdx. + virtual bool isCoalescableExtInstr(const MachineInstr &MI, + unsigned &SrcReg, unsigned &DstReg, + unsigned &SubIdx) const; + + unsigned isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const; + /// isLoadFromStackSlotPostFE - Check for post-frame ptr elimination + /// stack locations as well. This uses a heuristic so it isn't + /// reliable for correctness. 
+ unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const; + + unsigned isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const; + /// isStoreToStackSlotPostFE - Check for post-frame ptr elimination + /// stack locations as well. This uses a heuristic so it isn't + /// reliable for correctness. + unsigned isStoreToStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const; + + bool isReallyTriviallyReMaterializable(const MachineInstr *MI, + AliasAnalysis *AA) const; + void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + unsigned DestReg, unsigned SubIdx, + const MachineInstr *Orig, + const TargetRegisterInfo &TRI) const; + + /// convertToThreeAddress - This method must be implemented by targets that + /// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target + /// may be able to convert a two-address instruction into a true + /// three-address instruction on demand. This allows the X86 target (for + /// example) to convert ADD and SHL instructions into LEA instructions if they + /// would require register copies due to two-addressness. + /// + /// This method returns a null pointer if the transformation cannot be + /// performed, otherwise it returns the new instruction. + /// + virtual MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI, + MachineBasicBlock::iterator &MBBI, + LiveVariables *LV) const; + + /// commuteInstruction - We have a few instructions that must be hacked on to + /// commute them. + /// + virtual MachineInstr *commuteInstruction(MachineInstr *MI, bool NewMI) const; + + // Branch analysis. + virtual bool isUnpredicatedTerminator(const MachineInstr* MI) const; + virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const; + virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const; + virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + const SmallVectorImpl<MachineOperand> &Cond, + DebugLoc DL) const; + virtual void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const; + virtual void storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned SrcReg, bool isKill, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const; + + virtual void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill, + SmallVectorImpl<MachineOperand> &Addr, + const TargetRegisterClass *RC, + MachineInstr::mmo_iterator MMOBegin, + MachineInstr::mmo_iterator MMOEnd, + SmallVectorImpl<MachineInstr*> &NewMIs) const; + + virtual void loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const; + + virtual void loadRegFromAddr(MachineFunction &MF, unsigned DestReg, + SmallVectorImpl<MachineOperand> &Addr, + const TargetRegisterClass *RC, + MachineInstr::mmo_iterator MMOBegin, + MachineInstr::mmo_iterator MMOEnd, + SmallVectorImpl<MachineInstr*> &NewMIs) const; + + virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const; + + virtual + MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, + int FrameIx, uint64_t Offset, + const MDNode *MDPtr, + DebugLoc DL) const; + + /// foldMemoryOperand - If this target supports it, fold a load or store of 
+ /// the specified stack slot into the specified machine instruction for the
+ /// specified operand(s). If this is possible, the target should perform the
+ /// folding and return true, otherwise it should return false. If it folds
+ /// the instruction, it is likely that the MachineInstruction the iterator
+ /// references has been changed.
+ virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr* MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ int FrameIndex) const;
+
+ /// foldMemoryOperand - Same as the previous version except it allows folding
+ /// of any load and store from / to any address, not just from a specific
+ /// stack slot.
+ virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+ MachineInstr* MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ MachineInstr* LoadMI) const;
+
+ /// canFoldMemoryOperand - Returns true for the specified load / store if
+ /// folding is possible.
+ virtual bool canFoldMemoryOperand(const MachineInstr*,
+ const SmallVectorImpl<unsigned> &) const;
+
+ /// unfoldMemoryOperand - Separate a single instruction which folded a load or
+ /// a store or a load and a store into two or more instructions. If this is
+ /// possible, returns true as well as the new instructions by reference.
+ virtual bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
+ unsigned Reg, bool UnfoldLoad, bool UnfoldStore,
+ SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+ virtual bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
+ SmallVectorImpl<SDNode*> &NewNodes) const;
+
+ /// getOpcodeAfterMemoryUnfold - Returns the opcode of the would-be new
+ /// instruction after load / store are unfolded from an instruction of the
+ /// specified opcode. It returns zero if the specified unfolding is not
+ /// possible. If LoadRegIndex is non-null, it is filled in with the operand
+ /// index of the operand which will hold the register holding the loaded
+ /// value.
+ virtual unsigned getOpcodeAfterMemoryUnfold(unsigned Opc,
+ bool UnfoldLoad, bool UnfoldStore,
+ unsigned *LoadRegIndex = 0) const;
+
+ /// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler
+ /// to determine if two loads are loading from the same base address. It
+ /// should only return true if the base pointers are the same and the
+ /// only differences between the two addresses are the offsets. It also returns
+ /// the offsets by reference.
+ virtual bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
+ int64_t &Offset1, int64_t &Offset2) const;
+
+ /// shouldScheduleLoadsNear - This is used by the pre-regalloc scheduler to
+ /// determine (in conjunction with areLoadsFromSameBasePtr) if two loads should
+ /// be scheduled together. On some targets if two loads are loading from
+ /// addresses in the same cache line, it's better if they are scheduled
+ /// together. This function takes two integers that represent the load offsets
+ /// from the common base address. It returns true if it decides it's desirable
+ /// to schedule the two loads together. "NumLoads" is the number of loads that
+ /// have already been scheduled after Load1. 
+ virtual bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, + int64_t Offset1, int64_t Offset2, + unsigned NumLoads) const; + + virtual void getNoopForMachoTarget(MCInst &NopInst) const; + + virtual + bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const; + + /// isSafeToMoveRegClassDefs - Return true if it's safe to move a machine + /// instruction that defines the specified register class. + bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const; + + static bool isX86_64ExtendedReg(const MachineOperand &MO) { + if (!MO.isReg()) return false; + return X86II::isX86_64ExtendedReg(MO.getReg()); + } + + /// getGlobalBaseReg - Return a virtual register initialized with the + /// the global base register value. Output instructions required to + /// initialize the register in the function entry block, if necessary. + /// + unsigned getGlobalBaseReg(MachineFunction *MF) const; + + std::pair<uint16_t, uint16_t> + getExecutionDomain(const MachineInstr *MI) const; + + void setExecutionDomain(MachineInstr *MI, unsigned Domain) const; + + MachineInstr* foldMemoryOperandImpl(MachineFunction &MF, + MachineInstr* MI, + unsigned OpNum, + const SmallVectorImpl<MachineOperand> &MOs, + unsigned Size, unsigned Alignment) const; + + bool isHighLatencyDef(int opc) const; + + bool hasHighOperandLatency(const InstrItineraryData *ItinData, + const MachineRegisterInfo *MRI, + const MachineInstr *DefMI, unsigned DefIdx, + const MachineInstr *UseMI, unsigned UseIdx) const; + +private: + MachineInstr * convertToThreeAddressWithLEA(unsigned MIOpc, + MachineFunction::iterator &MFI, + MachineBasicBlock::iterator &MBBI, + LiveVariables *LV) const; + + /// isFrameOperand - Return true and the FrameIndex if the specified + /// operand and follow operands form a reference to the stack frame. + bool isFrameOperand(const MachineInstr *MI, unsigned int Op, + int &FrameIndex) const; +}; + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.td b/contrib/llvm/lib/Target/X86/X86InstrInfo.td new file mode 100644 index 0000000..d54bf27 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.td @@ -0,0 +1,1799 @@ +//===- X86InstrInfo.td - Main X86 Instruction Definition ---*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the X86 instruction set, defining the instructions, and +// properties of the instructions which are needed for code generation, machine +// code emission, and analysis. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// X86 specific DAG Nodes. +// + +def SDTIntShiftDOp: SDTypeProfile<1, 3, + [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, + SDTCisInt<0>, SDTCisInt<3>]>; + +def SDTX86CmpTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisSameAs<1, 2>]>; + +def SDTX86Cmpsd : SDTypeProfile<1, 3, [SDTCisVT<0, f64>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>; +def SDTX86Cmpss : SDTypeProfile<1, 3, [SDTCisVT<0, f32>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>; + +def SDTX86Cmov : SDTypeProfile<1, 4, + [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, + SDTCisVT<3, i8>, SDTCisVT<4, i32>]>; + +// Unary and binary operator instructions that set EFLAGS as a side-effect. 
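+// SDTUnaryArithWithFlags - RES, EFLAGS = op LHS
+// For example, X86ISD::BSF and X86ISD::INC (defined below) produce an integer
+// result plus an i32 EFLAGS value from a single operand.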
+def SDTUnaryArithWithFlags : SDTypeProfile<2, 1, + [SDTCisInt<0>, SDTCisVT<1, i32>]>; + +def SDTBinaryArithWithFlags : SDTypeProfile<2, 2, + [SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>, + SDTCisInt<0>, SDTCisVT<1, i32>]>; + +// SDTBinaryArithWithFlagsInOut - RES1, EFLAGS = op LHS, RHS, EFLAGS +def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3, + [SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>, + SDTCisInt<0>, + SDTCisVT<1, i32>, + SDTCisVT<4, i32>]>; +// RES1, RES2, FLAGS = op LHS, RHS +def SDT2ResultBinaryArithWithFlags : SDTypeProfile<3, 2, + [SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>, + SDTCisInt<0>, SDTCisVT<1, i32>]>; +def SDTX86BrCond : SDTypeProfile<0, 3, + [SDTCisVT<0, OtherVT>, + SDTCisVT<1, i8>, SDTCisVT<2, i32>]>; + +def SDTX86SetCC : SDTypeProfile<1, 2, + [SDTCisVT<0, i8>, + SDTCisVT<1, i8>, SDTCisVT<2, i32>]>; +def SDTX86SetCC_C : SDTypeProfile<1, 2, + [SDTCisInt<0>, + SDTCisVT<1, i8>, SDTCisVT<2, i32>]>; + +def SDTX86cas : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisInt<1>, + SDTCisVT<2, i8>]>; +def SDTX86caspair : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; + +def SDTX86atomicBinary : SDTypeProfile<2, 3, [SDTCisInt<0>, SDTCisInt<1>, + SDTCisPtrTy<2>, SDTCisInt<3>,SDTCisInt<4>]>; +def SDTX86Ret : SDTypeProfile<0, -1, [SDTCisVT<0, i16>]>; + +def SDT_X86CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>; +def SDT_X86CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, + SDTCisVT<1, i32>]>; + +def SDT_X86Call : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>; + +def SDT_X86VASTART_SAVE_XMM_REGS : SDTypeProfile<0, -1, [SDTCisVT<0, i8>, + SDTCisVT<1, iPTR>, + SDTCisVT<2, iPTR>]>; + +def SDT_X86VAARG_64 : SDTypeProfile<1, -1, [SDTCisPtrTy<0>, + SDTCisPtrTy<1>, + SDTCisVT<2, i32>, + SDTCisVT<3, i8>, + SDTCisVT<4, i32>]>; + +def SDTX86RepStr : SDTypeProfile<0, 1, [SDTCisVT<0, OtherVT>]>; + +def SDTX86Void : SDTypeProfile<0, 0, []>; + +def SDTX86Wrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>; + +def SDT_X86TLSADDR : SDTypeProfile<0, 1, [SDTCisInt<0>]>; + +def SDT_X86TLSCALL : SDTypeProfile<0, 1, [SDTCisInt<0>]>; + +def SDT_X86SEG_ALLOCA : SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>; + +def SDT_X86EHRET : SDTypeProfile<0, 1, [SDTCisInt<0>]>; + +def SDT_X86TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>; + +def SDT_X86MEMBARRIER : SDTypeProfile<0, 0, []>; +def SDT_X86MEMBARRIERNoSSE : SDTypeProfile<0, 1, [SDTCisInt<0>]>; + +def X86MemBarrier : SDNode<"X86ISD::MEMBARRIER", SDT_X86MEMBARRIER, + [SDNPHasChain]>; +def X86MemBarrierNoSSE : SDNode<"X86ISD::MEMBARRIER", SDT_X86MEMBARRIERNoSSE, + [SDNPHasChain]>; +def X86MFence : SDNode<"X86ISD::MFENCE", SDT_X86MEMBARRIER, + [SDNPHasChain]>; +def X86SFence : SDNode<"X86ISD::SFENCE", SDT_X86MEMBARRIER, + [SDNPHasChain]>; +def X86LFence : SDNode<"X86ISD::LFENCE", SDT_X86MEMBARRIER, + [SDNPHasChain]>; + + +def X86bsf : SDNode<"X86ISD::BSF", SDTUnaryArithWithFlags>; +def X86bsr : SDNode<"X86ISD::BSR", SDTUnaryArithWithFlags>; +def X86shld : SDNode<"X86ISD::SHLD", SDTIntShiftDOp>; +def X86shrd : SDNode<"X86ISD::SHRD", SDTIntShiftDOp>; + +def X86cmp : SDNode<"X86ISD::CMP" , SDTX86CmpTest>; +def X86bt : SDNode<"X86ISD::BT", SDTX86CmpTest>; + +def X86cmov : SDNode<"X86ISD::CMOV", SDTX86Cmov>; +def X86brcond : SDNode<"X86ISD::BRCOND", SDTX86BrCond, + [SDNPHasChain]>; +def X86setcc : SDNode<"X86ISD::SETCC", SDTX86SetCC>; +def X86setcc_c : SDNode<"X86ISD::SETCC_CARRY", SDTX86SetCC_C>; + +def X86cas : SDNode<"X86ISD::LCMPXCHG_DAG", SDTX86cas, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, + 
SDNPMayLoad, SDNPMemOperand]>; +def X86cas8 : SDNode<"X86ISD::LCMPXCHG8_DAG", SDTX86caspair, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, + SDNPMayLoad, SDNPMemOperand]>; +def X86cas16 : SDNode<"X86ISD::LCMPXCHG16_DAG", SDTX86caspair, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, + SDNPMayLoad, SDNPMemOperand]>; + +def X86AtomAdd64 : SDNode<"X86ISD::ATOMADD64_DAG", SDTX86atomicBinary, + [SDNPHasChain, SDNPMayStore, + SDNPMayLoad, SDNPMemOperand]>; +def X86AtomSub64 : SDNode<"X86ISD::ATOMSUB64_DAG", SDTX86atomicBinary, + [SDNPHasChain, SDNPMayStore, + SDNPMayLoad, SDNPMemOperand]>; +def X86AtomOr64 : SDNode<"X86ISD::ATOMOR64_DAG", SDTX86atomicBinary, + [SDNPHasChain, SDNPMayStore, + SDNPMayLoad, SDNPMemOperand]>; +def X86AtomXor64 : SDNode<"X86ISD::ATOMXOR64_DAG", SDTX86atomicBinary, + [SDNPHasChain, SDNPMayStore, + SDNPMayLoad, SDNPMemOperand]>; +def X86AtomAnd64 : SDNode<"X86ISD::ATOMAND64_DAG", SDTX86atomicBinary, + [SDNPHasChain, SDNPMayStore, + SDNPMayLoad, SDNPMemOperand]>; +def X86AtomNand64 : SDNode<"X86ISD::ATOMNAND64_DAG", SDTX86atomicBinary, + [SDNPHasChain, SDNPMayStore, + SDNPMayLoad, SDNPMemOperand]>; +def X86AtomSwap64 : SDNode<"X86ISD::ATOMSWAP64_DAG", SDTX86atomicBinary, + [SDNPHasChain, SDNPMayStore, + SDNPMayLoad, SDNPMemOperand]>; +def X86retflag : SDNode<"X86ISD::RET_FLAG", SDTX86Ret, + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; + +def X86vastart_save_xmm_regs : + SDNode<"X86ISD::VASTART_SAVE_XMM_REGS", + SDT_X86VASTART_SAVE_XMM_REGS, + [SDNPHasChain, SDNPVariadic]>; +def X86vaarg64 : + SDNode<"X86ISD::VAARG_64", SDT_X86VAARG_64, + [SDNPHasChain, SDNPMayLoad, SDNPMayStore, + SDNPMemOperand]>; +def X86callseq_start : + SDNode<"ISD::CALLSEQ_START", SDT_X86CallSeqStart, + [SDNPHasChain, SDNPOutGlue]>; +def X86callseq_end : + SDNode<"ISD::CALLSEQ_END", SDT_X86CallSeqEnd, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; + +def X86call : SDNode<"X86ISD::CALL", SDT_X86Call, + [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue, + SDNPVariadic]>; + +def X86rep_stos: SDNode<"X86ISD::REP_STOS", SDTX86RepStr, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore]>; +def X86rep_movs: SDNode<"X86ISD::REP_MOVS", SDTX86RepStr, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, + SDNPMayLoad]>; + +def X86rdtsc : SDNode<"X86ISD::RDTSC_DAG", SDTX86Void, + [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; + +def X86Wrapper : SDNode<"X86ISD::Wrapper", SDTX86Wrapper>; +def X86WrapperRIP : SDNode<"X86ISD::WrapperRIP", SDTX86Wrapper>; + +def X86tlsaddr : SDNode<"X86ISD::TLSADDR", SDT_X86TLSADDR, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; + +def X86ehret : SDNode<"X86ISD::EH_RETURN", SDT_X86EHRET, + [SDNPHasChain]>; + +def X86tcret : SDNode<"X86ISD::TC_RETURN", SDT_X86TCRET, + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; + +def X86add_flag : SDNode<"X86ISD::ADD", SDTBinaryArithWithFlags, + [SDNPCommutative]>; +def X86sub_flag : SDNode<"X86ISD::SUB", SDTBinaryArithWithFlags>; +def X86smul_flag : SDNode<"X86ISD::SMUL", SDTBinaryArithWithFlags, + [SDNPCommutative]>; +def X86umul_flag : SDNode<"X86ISD::UMUL", SDT2ResultBinaryArithWithFlags, + [SDNPCommutative]>; +def X86adc_flag : SDNode<"X86ISD::ADC", SDTBinaryArithWithFlagsInOut>; +def X86sbb_flag : SDNode<"X86ISD::SBB", SDTBinaryArithWithFlagsInOut>; + +def X86inc_flag : SDNode<"X86ISD::INC", SDTUnaryArithWithFlags>; +def X86dec_flag : SDNode<"X86ISD::DEC", SDTUnaryArithWithFlags>; +def X86or_flag : SDNode<"X86ISD::OR", SDTBinaryArithWithFlags, + [SDNPCommutative]>; +def X86xor_flag : SDNode<"X86ISD::XOR", 
SDTBinaryArithWithFlags, + [SDNPCommutative]>; +def X86and_flag : SDNode<"X86ISD::AND", SDTBinaryArithWithFlags, + [SDNPCommutative]>; +def X86andn_flag : SDNode<"X86ISD::ANDN", SDTBinaryArithWithFlags>; + +def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>; + +def X86WinAlloca : SDNode<"X86ISD::WIN_ALLOCA", SDTX86Void, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue]>; + +def X86SegAlloca : SDNode<"X86ISD::SEG_ALLOCA", SDT_X86SEG_ALLOCA, + [SDNPHasChain]>; + +def X86TLSCall : SDNode<"X86ISD::TLSCALL", SDT_X86TLSCALL, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; + +//===----------------------------------------------------------------------===// +// X86 Operand Definitions. +// + +// A version of ptr_rc which excludes SP, ESP, and RSP. This is used for +// the index operand of an address, to conform to x86 encoding restrictions. +def ptr_rc_nosp : PointerLikeRegClass<1>; + +// *mem - Operand definitions for the funky X86 addressing mode operands. +// +def X86MemAsmOperand : AsmOperandClass { + let Name = "Mem"; + let SuperClasses = []; +} +def X86AbsMemAsmOperand : AsmOperandClass { + let Name = "AbsMem"; + let SuperClasses = [X86MemAsmOperand]; +} +class X86MemOperand<string printMethod> : Operand<iPTR> { + let PrintMethod = printMethod; + let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm); + let ParserMatchClass = X86MemAsmOperand; +} + +let OperandType = "OPERAND_MEMORY" in { +def opaque32mem : X86MemOperand<"printopaquemem">; +def opaque48mem : X86MemOperand<"printopaquemem">; +def opaque80mem : X86MemOperand<"printopaquemem">; +def opaque512mem : X86MemOperand<"printopaquemem">; + +def i8mem : X86MemOperand<"printi8mem">; +def i16mem : X86MemOperand<"printi16mem">; +def i32mem : X86MemOperand<"printi32mem">; +def i64mem : X86MemOperand<"printi64mem">; +def i128mem : X86MemOperand<"printi128mem">; +def i256mem : X86MemOperand<"printi256mem">; +def f32mem : X86MemOperand<"printf32mem">; +def f64mem : X86MemOperand<"printf64mem">; +def f80mem : X86MemOperand<"printf80mem">; +def f128mem : X86MemOperand<"printf128mem">; +def f256mem : X86MemOperand<"printf256mem">; +} + +// A version of i8mem for use on x86-64 that uses GR64_NOREX instead of +// plain GR64, so that it doesn't potentially require a REX prefix. +def i8mem_NOREX : Operand<i64> { + let PrintMethod = "printi8mem"; + let MIOperandInfo = (ops GR64_NOREX, i8imm, GR64_NOREX_NOSP, i32imm, i8imm); + let ParserMatchClass = X86MemAsmOperand; + let OperandType = "OPERAND_MEMORY"; +} + +// GPRs available for tailcall. +// It represents GR64_TC or GR64_TCW64. +def ptr_rc_tailcall : PointerLikeRegClass<2>; + +// Special i32mem for addresses of load folding tail calls. These are not +// allowed to use callee-saved registers since they must be scheduled +// after callee-saved register are popped. +def i32mem_TC : Operand<i32> { + let PrintMethod = "printi32mem"; + let MIOperandInfo = (ops GR32_TC, i8imm, GR32_TC, i32imm, i8imm); + let ParserMatchClass = X86MemAsmOperand; + let OperandType = "OPERAND_MEMORY"; +} + +// Special i64mem for addresses of load folding tail calls. These are not +// allowed to use callee-saved registers since they must be scheduled +// after callee-saved register are popped. 
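+// In i64mem_TC below, as in the *mem operands above, the five MIOperandInfo
+// fields are (base, scale, index, displacement, segment); e.g. the address
+// 8(%rsp,%rcx,4) would use base=RSP, scale=4, index=RCX, disp=8, no segment.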
+def i64mem_TC : Operand<i64> { + let PrintMethod = "printi64mem"; + let MIOperandInfo = (ops ptr_rc_tailcall, i8imm, + ptr_rc_tailcall, i32imm, i8imm); + let ParserMatchClass = X86MemAsmOperand; + let OperandType = "OPERAND_MEMORY"; +} + +let OperandType = "OPERAND_PCREL", + ParserMatchClass = X86AbsMemAsmOperand, + PrintMethod = "print_pcrel_imm" in { +def i32imm_pcrel : Operand<i32>; +def i16imm_pcrel : Operand<i16>; + +def offset8 : Operand<i64>; +def offset16 : Operand<i64>; +def offset32 : Operand<i64>; +def offset64 : Operand<i64>; + +// Branch targets have OtherVT type and print as pc-relative values. +def brtarget : Operand<OtherVT>; +def brtarget8 : Operand<OtherVT>; + +} + +def SSECC : Operand<i8> { + let PrintMethod = "printSSECC"; + let OperandType = "OPERAND_IMMEDIATE"; +} + +class ImmSExtAsmOperandClass : AsmOperandClass { + let SuperClasses = [ImmAsmOperand]; + let RenderMethod = "addImmOperands"; +} + +class ImmZExtAsmOperandClass : AsmOperandClass { + let SuperClasses = [ImmAsmOperand]; + let RenderMethod = "addImmOperands"; +} + +// Sign-extended immediate classes. We don't need to define the full lattice +// here because there is no instruction with an ambiguity between ImmSExti64i32 +// and ImmSExti32i8. +// +// The strange ranges come from the fact that the assembler always works with +// 64-bit immediates, but for a 16-bit target value we want to accept both "-1" +// (which will be a -1ULL), and "0xFF" (-1 in 16-bits). + +// [0, 0x7FFFFFFF] | +// [0xFFFFFFFF80000000, 0xFFFFFFFFFFFFFFFF] +def ImmSExti64i32AsmOperand : ImmSExtAsmOperandClass { + let Name = "ImmSExti64i32"; +} + +// [0, 0x0000007F] | [0x000000000000FF80, 0x000000000000FFFF] | +// [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF] +def ImmSExti16i8AsmOperand : ImmSExtAsmOperandClass { + let Name = "ImmSExti16i8"; + let SuperClasses = [ImmSExti64i32AsmOperand]; +} + +// [0, 0x0000007F] | [0x00000000FFFFFF80, 0x00000000FFFFFFFF] | +// [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF] +def ImmSExti32i8AsmOperand : ImmSExtAsmOperandClass { + let Name = "ImmSExti32i8"; +} + +// [0, 0x000000FF] +def ImmZExtu32u8AsmOperand : ImmZExtAsmOperandClass { + let Name = "ImmZExtu32u8"; +} + + +// [0, 0x0000007F] | +// [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF] +def ImmSExti64i8AsmOperand : ImmSExtAsmOperandClass { + let Name = "ImmSExti64i8"; + let SuperClasses = [ImmSExti16i8AsmOperand, ImmSExti32i8AsmOperand, + ImmSExti64i32AsmOperand]; +} + +// A couple of more descriptive operand definitions. +// 16-bits but only 8 bits are significant. +def i16i8imm : Operand<i16> { + let ParserMatchClass = ImmSExti16i8AsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; +} +// 32-bits but only 8 bits are significant. +def i32i8imm : Operand<i32> { + let ParserMatchClass = ImmSExti32i8AsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; +} +// 32-bits but only 8 bits are significant, and those 8 bits are unsigned. +def u32u8imm : Operand<i32> { + let ParserMatchClass = ImmZExtu32u8AsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; +} + +// 64-bits but only 32 bits are significant. +def i64i32imm : Operand<i64> { + let ParserMatchClass = ImmSExti64i32AsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; +} + +// 64-bits but only 32 bits are significant, and those bits are treated as being +// pc relative. +def i64i32imm_pcrel : Operand<i64> { + let PrintMethod = "print_pcrel_imm"; + let ParserMatchClass = X86AbsMemAsmOperand; + let OperandType = "OPERAND_PCREL"; +} + +// 64-bits but only 8 bits are significant. 
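+// For example, ADD64ri8 (in X86InstrArithmetic.td) can encode $-1 or $0x7f
+// through this operand, while $0x80 falls outside the sign-extended 8-bit
+// range and must use the 32-bit immediate form instead.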
+def i64i8imm : Operand<i64> { + let ParserMatchClass = ImmSExti64i8AsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; +} + +def lea64_32mem : Operand<i32> { + let PrintMethod = "printi32mem"; + let AsmOperandLowerMethod = "lower_lea64_32mem"; + let MIOperandInfo = (ops GR32, i8imm, GR32_NOSP, i32imm, i8imm); + let ParserMatchClass = X86MemAsmOperand; +} + + +//===----------------------------------------------------------------------===// +// X86 Complex Pattern Definitions. +// + +// Define X86 specific addressing mode. +def addr : ComplexPattern<iPTR, 5, "SelectAddr", [], [SDNPWantParent]>; +def lea32addr : ComplexPattern<i32, 5, "SelectLEAAddr", + [add, sub, mul, X86mul_imm, shl, or, frameindex], + []>; +def tls32addr : ComplexPattern<i32, 5, "SelectTLSADDRAddr", + [tglobaltlsaddr], []>; + +def lea64addr : ComplexPattern<i64, 5, "SelectLEAAddr", + [add, sub, mul, X86mul_imm, shl, or, frameindex, + X86WrapperRIP], []>; + +def tls64addr : ComplexPattern<i64, 5, "SelectTLSADDRAddr", + [tglobaltlsaddr], []>; + +//===----------------------------------------------------------------------===// +// X86 Instruction Predicate Definitions. +def HasCMov : Predicate<"Subtarget->hasCMov()">; +def NoCMov : Predicate<"!Subtarget->hasCMov()">; + +def HasMMX : Predicate<"Subtarget->hasMMX()">; +def Has3DNow : Predicate<"Subtarget->has3DNow()">; +def Has3DNowA : Predicate<"Subtarget->has3DNowA()">; +def HasSSE1 : Predicate<"Subtarget->hasSSE1()">; +def HasSSE2 : Predicate<"Subtarget->hasSSE2()">; +def HasSSE3 : Predicate<"Subtarget->hasSSE3()">; +def HasSSSE3 : Predicate<"Subtarget->hasSSSE3()">; +def HasSSE41 : Predicate<"Subtarget->hasSSE41()">; +def HasSSE42 : Predicate<"Subtarget->hasSSE42()">; +def HasSSE4A : Predicate<"Subtarget->hasSSE4A()">; + +def HasAVX : Predicate<"Subtarget->hasAVX()">; +def HasXMMInt : Predicate<"Subtarget->hasXMMInt()">; + +def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">; +def HasAES : Predicate<"Subtarget->hasAES()">; +def HasCLMUL : Predicate<"Subtarget->hasCLMUL()">; +def HasFMA3 : Predicate<"Subtarget->hasFMA3()">; +def HasFMA4 : Predicate<"Subtarget->hasFMA4()">; +def HasMOVBE : Predicate<"Subtarget->hasMOVBE()">; +def HasRDRAND : Predicate<"Subtarget->hasRDRAND()">; +def HasF16C : Predicate<"Subtarget->hasF16C()">; +def HasLZCNT : Predicate<"Subtarget->hasLZCNT()">; +def HasBMI : Predicate<"Subtarget->hasBMI()">; +def FPStackf32 : Predicate<"!Subtarget->hasXMM()">; +def FPStackf64 : Predicate<"!Subtarget->hasXMMInt()">; +def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">; +def In32BitMode : Predicate<"!Subtarget->is64Bit()">, + AssemblerPredicate<"!Mode64Bit">; +def In64BitMode : Predicate<"Subtarget->is64Bit()">, + AssemblerPredicate<"Mode64Bit">; +def IsWin64 : Predicate<"Subtarget->isTargetWin64()">; +def NotWin64 : Predicate<"!Subtarget->isTargetWin64()">; +def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">, + AssemblerPredicate<"ModeNaCl">; +def IsNaCl32 : Predicate<"Subtarget->isTargetNaCl32()">, + AssemblerPredicate<"ModeNaCl,!Mode64Bit">; +def IsNaCl64 : Predicate<"Subtarget->isTargetNaCl64()">, + AssemblerPredicate<"ModeNaCl,Mode64Bit">; +def NotNaCl : Predicate<"!Subtarget->isTargetNaCl()">, + AssemblerPredicate<"!ModeNaCl">; +def SmallCode : Predicate<"TM.getCodeModel() == CodeModel::Small">; +def KernelCode : Predicate<"TM.getCodeModel() == CodeModel::Kernel">; +def FarData : Predicate<"TM.getCodeModel() != CodeModel::Small &&" + "TM.getCodeModel() != CodeModel::Kernel">; +def NearData : Predicate<"TM.getCodeModel() == CodeModel::Small ||" + 
"TM.getCodeModel() == CodeModel::Kernel">; +def IsStatic : Predicate<"TM.getRelocationModel() == Reloc::Static">; +def IsNotPIC : Predicate<"TM.getRelocationModel() != Reloc::PIC_">; +def OptForSize : Predicate<"OptForSize">; +def OptForSpeed : Predicate<"!OptForSize">; +def FastBTMem : Predicate<"!Subtarget->isBTMemSlow()">; +def CallImmAddr : Predicate<"Subtarget->IsLegalToCallImmediateAddr(TM)">; + +//===----------------------------------------------------------------------===// +// X86 Instruction Format Definitions. +// + +include "X86InstrFormats.td" + +//===----------------------------------------------------------------------===// +// Pattern fragments. +// + +// X86 specific condition code. These correspond to CondCode in +// X86InstrInfo.h. They must be kept in synch. +def X86_COND_A : PatLeaf<(i8 0)>; // alt. COND_NBE +def X86_COND_AE : PatLeaf<(i8 1)>; // alt. COND_NC +def X86_COND_B : PatLeaf<(i8 2)>; // alt. COND_C +def X86_COND_BE : PatLeaf<(i8 3)>; // alt. COND_NA +def X86_COND_E : PatLeaf<(i8 4)>; // alt. COND_Z +def X86_COND_G : PatLeaf<(i8 5)>; // alt. COND_NLE +def X86_COND_GE : PatLeaf<(i8 6)>; // alt. COND_NL +def X86_COND_L : PatLeaf<(i8 7)>; // alt. COND_NGE +def X86_COND_LE : PatLeaf<(i8 8)>; // alt. COND_NG +def X86_COND_NE : PatLeaf<(i8 9)>; // alt. COND_NZ +def X86_COND_NO : PatLeaf<(i8 10)>; +def X86_COND_NP : PatLeaf<(i8 11)>; // alt. COND_PO +def X86_COND_NS : PatLeaf<(i8 12)>; +def X86_COND_O : PatLeaf<(i8 13)>; +def X86_COND_P : PatLeaf<(i8 14)>; // alt. COND_PE +def X86_COND_S : PatLeaf<(i8 15)>; + +let FastIselShouldIgnore = 1 in { // FastIsel should ignore all simm8 instrs. + def i16immSExt8 : ImmLeaf<i16, [{ return Imm == (int8_t)Imm; }]>; + def i32immSExt8 : ImmLeaf<i32, [{ return Imm == (int8_t)Imm; }]>; + def i64immSExt8 : ImmLeaf<i64, [{ return Imm == (int8_t)Imm; }]>; +} + +def i64immSExt32 : ImmLeaf<i64, [{ return Imm == (int32_t)Imm; }]>; + + +// i64immZExt32 predicate - True if the 64-bit immediate fits in a 32-bit +// unsigned field. +def i64immZExt32 : ImmLeaf<i64, [{ return (uint64_t)Imm == (uint32_t)Imm; }]>; + +def i64immZExt32SExt8 : ImmLeaf<i64, [{ + return (uint64_t)Imm == (uint32_t)Imm && (int32_t)Imm == (int8_t)Imm; +}]>; + +// Helper fragments for loads. +// It's always safe to treat a anyext i16 load as a i32 load if the i16 is +// known to be 32-bit aligned or better. Ditto for i8 to i16. 
+def loadi16 : PatFrag<(ops node:$ptr), (i16 (unindexedload node:$ptr)), [{ + LoadSDNode *LD = cast<LoadSDNode>(N); + ISD::LoadExtType ExtType = LD->getExtensionType(); + if (ExtType == ISD::NON_EXTLOAD) + return true; + if (ExtType == ISD::EXTLOAD) + return LD->getAlignment() >= 2 && !LD->isVolatile(); + return false; +}]>; + +def loadi16_anyext : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)),[{ + LoadSDNode *LD = cast<LoadSDNode>(N); + ISD::LoadExtType ExtType = LD->getExtensionType(); + if (ExtType == ISD::EXTLOAD) + return LD->getAlignment() >= 2 && !LD->isVolatile(); + return false; +}]>; + +def loadi32 : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)), [{ + LoadSDNode *LD = cast<LoadSDNode>(N); + ISD::LoadExtType ExtType = LD->getExtensionType(); + if (ExtType == ISD::NON_EXTLOAD) + return true; + if (ExtType == ISD::EXTLOAD) + return LD->getAlignment() >= 4 && !LD->isVolatile(); + return false; +}]>; + +def loadi8 : PatFrag<(ops node:$ptr), (i8 (load node:$ptr))>; +def loadi64 : PatFrag<(ops node:$ptr), (i64 (load node:$ptr))>; +def loadf32 : PatFrag<(ops node:$ptr), (f32 (load node:$ptr))>; +def loadf64 : PatFrag<(ops node:$ptr), (f64 (load node:$ptr))>; +def loadf80 : PatFrag<(ops node:$ptr), (f80 (load node:$ptr))>; + +def sextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (sextloadi8 node:$ptr))>; +def sextloadi32i8 : PatFrag<(ops node:$ptr), (i32 (sextloadi8 node:$ptr))>; +def sextloadi32i16 : PatFrag<(ops node:$ptr), (i32 (sextloadi16 node:$ptr))>; +def sextloadi64i8 : PatFrag<(ops node:$ptr), (i64 (sextloadi8 node:$ptr))>; +def sextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (sextloadi16 node:$ptr))>; +def sextloadi64i32 : PatFrag<(ops node:$ptr), (i64 (sextloadi32 node:$ptr))>; + +def zextloadi8i1 : PatFrag<(ops node:$ptr), (i8 (zextloadi1 node:$ptr))>; +def zextloadi16i1 : PatFrag<(ops node:$ptr), (i16 (zextloadi1 node:$ptr))>; +def zextloadi32i1 : PatFrag<(ops node:$ptr), (i32 (zextloadi1 node:$ptr))>; +def zextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (zextloadi8 node:$ptr))>; +def zextloadi32i8 : PatFrag<(ops node:$ptr), (i32 (zextloadi8 node:$ptr))>; +def zextloadi32i16 : PatFrag<(ops node:$ptr), (i32 (zextloadi16 node:$ptr))>; +def zextloadi64i1 : PatFrag<(ops node:$ptr), (i64 (zextloadi1 node:$ptr))>; +def zextloadi64i8 : PatFrag<(ops node:$ptr), (i64 (zextloadi8 node:$ptr))>; +def zextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (zextloadi16 node:$ptr))>; +def zextloadi64i32 : PatFrag<(ops node:$ptr), (i64 (zextloadi32 node:$ptr))>; + +def extloadi8i1 : PatFrag<(ops node:$ptr), (i8 (extloadi1 node:$ptr))>; +def extloadi16i1 : PatFrag<(ops node:$ptr), (i16 (extloadi1 node:$ptr))>; +def extloadi32i1 : PatFrag<(ops node:$ptr), (i32 (extloadi1 node:$ptr))>; +def extloadi16i8 : PatFrag<(ops node:$ptr), (i16 (extloadi8 node:$ptr))>; +def extloadi32i8 : PatFrag<(ops node:$ptr), (i32 (extloadi8 node:$ptr))>; +def extloadi32i16 : PatFrag<(ops node:$ptr), (i32 (extloadi16 node:$ptr))>; +def extloadi64i1 : PatFrag<(ops node:$ptr), (i64 (extloadi1 node:$ptr))>; +def extloadi64i8 : PatFrag<(ops node:$ptr), (i64 (extloadi8 node:$ptr))>; +def extloadi64i16 : PatFrag<(ops node:$ptr), (i64 (extloadi16 node:$ptr))>; +def extloadi64i32 : PatFrag<(ops node:$ptr), (i64 (extloadi32 node:$ptr))>; + + +// An 'and' node with a single use. +def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{ + return N->hasOneUse(); +}]>; +// An 'srl' node with a single use. 
+def srl_su : PatFrag<(ops node:$lhs, node:$rhs), (srl node:$lhs, node:$rhs), [{ + return N->hasOneUse(); +}]>; +// An 'trunc' node with a single use. +def trunc_su : PatFrag<(ops node:$src), (trunc node:$src), [{ + return N->hasOneUse(); +}]>; + +//===----------------------------------------------------------------------===// +// Instruction list. +// + +// Nop +let neverHasSideEffects = 1 in { + def NOOP : I<0x90, RawFrm, (outs), (ins), "nop", []>; + def NOOPW : I<0x1f, MRM0m, (outs), (ins i16mem:$zero), + "nop{w}\t$zero", []>, TB, OpSize; + def NOOPL : I<0x1f, MRM0m, (outs), (ins i32mem:$zero), + "nop{l}\t$zero", []>, TB; +} + + +// Constructing a stack frame. +def ENTER : Ii16<0xC8, RawFrmImm8, (outs), (ins i16imm:$len, i8imm:$lvl), + "enter\t$len, $lvl", []>; + +let Defs = [EBP, ESP], Uses = [EBP, ESP], mayLoad = 1, neverHasSideEffects=1 in +def LEAVE : I<0xC9, RawFrm, + (outs), (ins), "leave", []>, Requires<[In32BitMode]>; + +let Defs = [RBP,RSP], Uses = [RBP,RSP], mayLoad = 1, neverHasSideEffects = 1 in +def LEAVE64 : I<0xC9, RawFrm, + (outs), (ins), "leave", []>, Requires<[In64BitMode]>; + +//===----------------------------------------------------------------------===// +// Miscellaneous Instructions. +// + +let Defs = [ESP], Uses = [ESP], neverHasSideEffects=1 in { +let mayLoad = 1 in { +def POP16r : I<0x58, AddRegFrm, (outs GR16:$reg), (ins), "pop{w}\t$reg", []>, + OpSize; +def POP32r : I<0x58, AddRegFrm, (outs GR32:$reg), (ins), "pop{l}\t$reg", []>; +def POP16rmr: I<0x8F, MRM0r, (outs GR16:$reg), (ins), "pop{w}\t$reg", []>, + OpSize; +def POP16rmm: I<0x8F, MRM0m, (outs i16mem:$dst), (ins), "pop{w}\t$dst", []>, + OpSize; +def POP32rmr: I<0x8F, MRM0r, (outs GR32:$reg), (ins), "pop{l}\t$reg", []>; +def POP32rmm: I<0x8F, MRM0m, (outs i32mem:$dst), (ins), "pop{l}\t$dst", []>; + +def POPF16 : I<0x9D, RawFrm, (outs), (ins), "popf{w}", []>, OpSize; +def POPF32 : I<0x9D, RawFrm, (outs), (ins), "popf{l|d}", []>, + Requires<[In32BitMode]>; +} + +let mayStore = 1 in { +def PUSH16r : I<0x50, AddRegFrm, (outs), (ins GR16:$reg), "push{w}\t$reg",[]>, + OpSize; +def PUSH32r : I<0x50, AddRegFrm, (outs), (ins GR32:$reg), "push{l}\t$reg",[]>; +def PUSH16rmr: I<0xFF, MRM6r, (outs), (ins GR16:$reg), "push{w}\t$reg",[]>, + OpSize; +def PUSH16rmm: I<0xFF, MRM6m, (outs), (ins i16mem:$src), "push{w}\t$src",[]>, + OpSize; +def PUSH32rmr: I<0xFF, MRM6r, (outs), (ins GR32:$reg), "push{l}\t$reg",[]>; +def PUSH32rmm: I<0xFF, MRM6m, (outs), (ins i32mem:$src), "push{l}\t$src",[]>; + +def PUSHi8 : Ii8<0x6a, RawFrm, (outs), (ins i32i8imm:$imm), + "push{l}\t$imm", []>; +def PUSHi16 : Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm), + "push{w}\t$imm", []>, OpSize; +def PUSHi32 : Ii32<0x68, RawFrm, (outs), (ins i32imm:$imm), + "push{l}\t$imm", []>; + +def PUSHF16 : I<0x9C, RawFrm, (outs), (ins), "pushf{w}", []>, OpSize; +def PUSHF32 : I<0x9C, RawFrm, (outs), (ins), "pushf{l|d}", []>, + Requires<[In32BitMode]>; + +} +} + +let Defs = [RSP], Uses = [RSP], neverHasSideEffects=1 in { +let mayLoad = 1 in { +def POP64r : I<0x58, AddRegFrm, + (outs GR64:$reg), (ins), "pop{q}\t$reg", []>; +def POP64rmr: I<0x8F, MRM0r, (outs GR64:$reg), (ins), "pop{q}\t$reg", []>; +def POP64rmm: I<0x8F, MRM0m, (outs i64mem:$dst), (ins), "pop{q}\t$dst", []>; +} +let mayStore = 1 in { +def PUSH64r : I<0x50, AddRegFrm, + (outs), (ins GR64:$reg), "push{q}\t$reg", []>; +def PUSH64rmr: I<0xFF, MRM6r, (outs), (ins GR64:$reg), "push{q}\t$reg", []>; +def PUSH64rmm: I<0xFF, MRM6m, (outs), (ins i64mem:$src), "push{q}\t$src", []>; +} +} + +let Defs = [RSP], 
Uses = [RSP], neverHasSideEffects = 1, mayStore = 1 in { +def PUSH64i8 : Ii8<0x6a, RawFrm, (outs), (ins i64i8imm:$imm), + "push{q}\t$imm", []>; +def PUSH64i16 : Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm), + "push{q}\t$imm", []>; +def PUSH64i32 : Ii32<0x68, RawFrm, (outs), (ins i64i32imm:$imm), + "push{q}\t$imm", []>; +} + +let Defs = [RSP, EFLAGS], Uses = [RSP], mayLoad = 1, neverHasSideEffects=1 in +def POPF64 : I<0x9D, RawFrm, (outs), (ins), "popfq", []>, + Requires<[In64BitMode]>; +let Defs = [RSP], Uses = [RSP, EFLAGS], mayStore = 1, neverHasSideEffects=1 in +def PUSHF64 : I<0x9C, RawFrm, (outs), (ins), "pushfq", []>, + Requires<[In64BitMode]>; + + + +let Defs = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP], Uses = [ESP], + mayLoad=1, neverHasSideEffects=1 in { +def POPA32 : I<0x61, RawFrm, (outs), (ins), "popa{l}", []>, + Requires<[In32BitMode]>; +} +let Defs = [ESP], Uses = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP], + mayStore=1, neverHasSideEffects=1 in { +def PUSHA32 : I<0x60, RawFrm, (outs), (ins), "pusha{l}", []>, + Requires<[In32BitMode]>; +} + +let Constraints = "$src = $dst" in { // GR32 = bswap GR32 +def BSWAP32r : I<0xC8, AddRegFrm, + (outs GR32:$dst), (ins GR32:$src), + "bswap{l}\t$dst", + [(set GR32:$dst, (bswap GR32:$src))]>, TB; + +def BSWAP64r : RI<0xC8, AddRegFrm, (outs GR64:$dst), (ins GR64:$src), + "bswap{q}\t$dst", + [(set GR64:$dst, (bswap GR64:$src))]>, TB; +} // Constraints = "$src = $dst" + +// Bit scan instructions. +let Defs = [EFLAGS] in { +def BSF16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), + "bsf{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, EFLAGS, (X86bsf GR16:$src))]>, TB, OpSize; +def BSF16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "bsf{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, EFLAGS, (X86bsf (loadi16 addr:$src)))]>, TB, + OpSize; +def BSF32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "bsf{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))]>, TB; +def BSF32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "bsf{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, EFLAGS, (X86bsf (loadi32 addr:$src)))]>, TB; +def BSF64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), + "bsf{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, EFLAGS, (X86bsf GR64:$src))]>, TB; +def BSF64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "bsf{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, EFLAGS, (X86bsf (loadi64 addr:$src)))]>, TB; + +def BSR16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), + "bsr{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, EFLAGS, (X86bsr GR16:$src))]>, TB, OpSize; +def BSR16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "bsr{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, EFLAGS, (X86bsr (loadi16 addr:$src)))]>, TB, + OpSize; +def BSR32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "bsr{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))]>, TB; +def BSR32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "bsr{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, EFLAGS, (X86bsr (loadi32 addr:$src)))]>, TB; +def BSR64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), + "bsr{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))]>, TB; +def BSR64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "bsr{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, EFLAGS, (X86bsr (loadi64 addr:$src)))]>, TB; +} // Defs = [EFLAGS] + + +// 
These use the DF flag in the EFLAGS register to inc or dec EDI and ESI
+let Defs = [EDI,ESI], Uses = [EDI,ESI,EFLAGS] in {
+def MOVSB : I<0xA4, RawFrm, (outs), (ins), "movsb", []>;
+def MOVSW : I<0xA5, RawFrm, (outs), (ins), "movsw", []>, OpSize;
+def MOVSD : I<0xA5, RawFrm, (outs), (ins), "movs{l|d}", []>;
+def MOVSQ : RI<0xA5, RawFrm, (outs), (ins), "movsq", []>;
+}
+
+// These use the DF flag in the EFLAGS register to inc or dec EDI and ESI
+let Defs = [EDI], Uses = [AL,EDI,EFLAGS] in
+def STOSB : I<0xAA, RawFrm, (outs), (ins), "stosb", []>;
+let Defs = [EDI], Uses = [AX,EDI,EFLAGS] in
+def STOSW : I<0xAB, RawFrm, (outs), (ins), "stosw", []>, OpSize;
+let Defs = [EDI], Uses = [EAX,EDI,EFLAGS] in
+def STOSD : I<0xAB, RawFrm, (outs), (ins), "stos{l|d}", []>;
+let Defs = [RCX,RDI], Uses = [RAX,RCX,RDI,EFLAGS] in
+def STOSQ : RI<0xAB, RawFrm, (outs), (ins), "stosq", []>;
+
+def SCAS8 : I<0xAE, RawFrm, (outs), (ins), "scasb", []>;
+def SCAS16 : I<0xAF, RawFrm, (outs), (ins), "scasw", []>, OpSize;
+def SCAS32 : I<0xAF, RawFrm, (outs), (ins), "scas{l|d}", []>;
+def SCAS64 : RI<0xAF, RawFrm, (outs), (ins), "scasq", []>;
+
+def CMPS8 : I<0xA6, RawFrm, (outs), (ins), "cmpsb", []>;
+def CMPS16 : I<0xA7, RawFrm, (outs), (ins), "cmpsw", []>, OpSize;
+def CMPS32 : I<0xA7, RawFrm, (outs), (ins), "cmps{l|d}", []>;
+def CMPS64 : RI<0xA7, RawFrm, (outs), (ins), "cmpsq", []>;
+
+
+//===----------------------------------------------------------------------===//
+// Move Instructions.
+//
+
+let neverHasSideEffects = 1 in {
+def MOV8rr : I<0x88, MRMDestReg, (outs GR8 :$dst), (ins GR8 :$src),
+               "mov{b}\t{$src, $dst|$dst, $src}", []>;
+def MOV16rr : I<0x89, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
+               "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize;
+def MOV32rr : I<0x89, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
+               "mov{l}\t{$src, $dst|$dst, $src}", []>;
+def MOV64rr : RI<0x89, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
+                 "mov{q}\t{$src, $dst|$dst, $src}", []>;
+}
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+def MOV8ri : Ii8 <0xB0, AddRegFrm, (outs GR8 :$dst), (ins i8imm :$src),
+                  "mov{b}\t{$src, $dst|$dst, $src}",
+                  [(set GR8:$dst, imm:$src)]>;
+def MOV16ri : Ii16<0xB8, AddRegFrm, (outs GR16:$dst), (ins i16imm:$src),
+                   "mov{w}\t{$src, $dst|$dst, $src}",
+                   [(set GR16:$dst, imm:$src)]>, OpSize;
+def MOV32ri : Ii32<0xB8, AddRegFrm, (outs GR32:$dst), (ins i32imm:$src),
+                   "mov{l}\t{$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, imm:$src)]>;
+def MOV64ri : RIi64<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64imm:$src),
+                    "movabs{q}\t{$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, imm:$src)]>;
+def MOV64ri32 : RIi32<0xC7, MRM0r, (outs GR64:$dst), (ins i64i32imm:$src),
+                      "mov{q}\t{$src, $dst|$dst, $src}",
+                      [(set GR64:$dst, i64immSExt32:$src)]>;
+}
+
+def MOV8mi : Ii8 <0xC6, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src),
+                  "mov{b}\t{$src, $dst|$dst, $src}",
+                  [(store (i8 imm:$src), addr:$dst)]>;
+def MOV16mi : Ii16<0xC7, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src),
+                   "mov{w}\t{$src, $dst|$dst, $src}",
+                   [(store (i16 imm:$src), addr:$dst)]>, OpSize;
+def MOV32mi : Ii32<0xC7, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src),
+                   "mov{l}\t{$src, $dst|$dst, $src}",
+                   [(store (i32 imm:$src), addr:$dst)]>;
+def MOV64mi32 : RIi32<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src),
+                      "mov{q}\t{$src, $dst|$dst, $src}",
+                      [(store i64immSExt32:$src, addr:$dst)]>;
+
+/// moffs8, moffs16 and moffs32 versions of moves. The immediate is a
+/// 32-bit offset from the PC. These are only valid in x86-32 mode.
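+/// For example, MOV32o32a below encodes as opcode A1 followed by the 32-bit
+/// offset; these forms only move to or from the accumulator (AL, AX or EAX).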
+def MOV8o8a : Ii32 <0xA0, RawFrm, (outs), (ins offset8:$src), + "mov{b}\t{$src, %al|AL, $src}", []>, + Requires<[In32BitMode]>; +def MOV16o16a : Ii32 <0xA1, RawFrm, (outs), (ins offset16:$src), + "mov{w}\t{$src, %ax|AL, $src}", []>, OpSize, + Requires<[In32BitMode]>; +def MOV32o32a : Ii32 <0xA1, RawFrm, (outs), (ins offset32:$src), + "mov{l}\t{$src, %eax|EAX, $src}", []>, + Requires<[In32BitMode]>; +def MOV8ao8 : Ii32 <0xA2, RawFrm, (outs offset8:$dst), (ins), + "mov{b}\t{%al, $dst|$dst, AL}", []>, + Requires<[In32BitMode]>; +def MOV16ao16 : Ii32 <0xA3, RawFrm, (outs offset16:$dst), (ins), + "mov{w}\t{%ax, $dst|$dst, AL}", []>, OpSize, + Requires<[In32BitMode]>; +def MOV32ao32 : Ii32 <0xA3, RawFrm, (outs offset32:$dst), (ins), + "mov{l}\t{%eax, $dst|$dst, EAX}", []>, + Requires<[In32BitMode]>; + +// FIXME: These definitions are utterly broken +// Just leave them commented out for now because they're useless outside +// of the large code model, and most compilers won't generate the instructions +// in question. +/* +def MOV64o8a : RIi8<0xA0, RawFrm, (outs), (ins offset8:$src), + "mov{q}\t{$src, %rax|RAX, $src}", []>; +def MOV64o64a : RIi32<0xA1, RawFrm, (outs), (ins offset64:$src), + "mov{q}\t{$src, %rax|RAX, $src}", []>; +def MOV64ao8 : RIi8<0xA2, RawFrm, (outs offset8:$dst), (ins), + "mov{q}\t{%rax, $dst|$dst, RAX}", []>; +def MOV64ao64 : RIi32<0xA3, RawFrm, (outs offset64:$dst), (ins), + "mov{q}\t{%rax, $dst|$dst, RAX}", []>; +*/ + + +let isCodeGenOnly = 1 in { +def MOV8rr_REV : I<0x8A, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src), + "mov{b}\t{$src, $dst|$dst, $src}", []>; +def MOV16rr_REV : I<0x8B, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), + "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize; +def MOV32rr_REV : I<0x8B, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "mov{l}\t{$src, $dst|$dst, $src}", []>; +def MOV64rr_REV : RI<0x8B, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), + "mov{q}\t{$src, $dst|$dst, $src}", []>; +} + +let canFoldAsLoad = 1, isReMaterializable = 1 in { +def MOV8rm : I<0x8A, MRMSrcMem, (outs GR8 :$dst), (ins i8mem :$src), + "mov{b}\t{$src, $dst|$dst, $src}", + [(set GR8:$dst, (loadi8 addr:$src))]>; +def MOV16rm : I<0x8B, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "mov{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, (loadi16 addr:$src))]>, OpSize; +def MOV32rm : I<0x8B, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "mov{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (loadi32 addr:$src))]>; +def MOV64rm : RI<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "mov{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (load addr:$src))]>; +} + +def MOV8mr : I<0x88, MRMDestMem, (outs), (ins i8mem :$dst, GR8 :$src), + "mov{b}\t{$src, $dst|$dst, $src}", + [(store GR8:$src, addr:$dst)]>; +def MOV16mr : I<0x89, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src), + "mov{w}\t{$src, $dst|$dst, $src}", + [(store GR16:$src, addr:$dst)]>, OpSize; +def MOV32mr : I<0x89, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), + "mov{l}\t{$src, $dst|$dst, $src}", + [(store GR32:$src, addr:$dst)]>; +def MOV64mr : RI<0x89, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), + "mov{q}\t{$src, $dst|$dst, $src}", + [(store GR64:$src, addr:$dst)]>; + +// Versions of MOV8rr, MOV8mr, and MOV8rm that use i8mem_NOREX and GR8_NOREX so +// that they can be used for copying and storing h registers, which can't be +// encoded when a REX prefix is present. 
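+// For example, "movb %ah, (%r8)" cannot be encoded: addressing R8 requires a
+// REX prefix, and once any REX prefix is present the reg-field value 4
+// selects %spl rather than %ah.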
+let isCodeGenOnly = 1 in { +let neverHasSideEffects = 1 in +def MOV8rr_NOREX : I<0x88, MRMDestReg, + (outs GR8_NOREX:$dst), (ins GR8_NOREX:$src), + "mov{b}\t{$src, $dst|$dst, $src} # NOREX", []>; +let mayStore = 1 in +def MOV8mr_NOREX : I<0x88, MRMDestMem, + (outs), (ins i8mem_NOREX:$dst, GR8_NOREX:$src), + "mov{b}\t{$src, $dst|$dst, $src} # NOREX", []>; +let mayLoad = 1, neverHasSideEffects = 1, + canFoldAsLoad = 1, isReMaterializable = 1 in +def MOV8rm_NOREX : I<0x8A, MRMSrcMem, + (outs GR8_NOREX:$dst), (ins i8mem_NOREX:$src), + "mov{b}\t{$src, $dst|$dst, $src} # NOREX", []>; +} + + +// Condition code ops, incl. set if equal/not equal/... +let Defs = [EFLAGS], Uses = [AH], neverHasSideEffects = 1 in +def SAHF : I<0x9E, RawFrm, (outs), (ins), "sahf", []>; // flags = AH +let Defs = [AH], Uses = [EFLAGS], neverHasSideEffects = 1 in +def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", []>; // AH = flags + + +//===----------------------------------------------------------------------===// +// Bit tests instructions: BT, BTS, BTR, BTC. + +let Defs = [EFLAGS] in { +def BT16rr : I<0xA3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2), + "bt{w}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt GR16:$src1, GR16:$src2))]>, OpSize, TB; +def BT32rr : I<0xA3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2), + "bt{l}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt GR32:$src1, GR32:$src2))]>, TB; +def BT64rr : RI<0xA3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2), + "bt{q}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt GR64:$src1, GR64:$src2))]>, TB; + +// Unlike with the register+register form, the memory+register form of the +// bt instruction does not ignore the high bits of the index. From ISel's +// perspective, this is pretty bizarre. Make these instructions disassembly +// only for now. + +def BT16mr : I<0xA3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), + "bt{w}\t{$src2, $src1|$src1, $src2}", +// [(X86bt (loadi16 addr:$src1), GR16:$src2), +// (implicit EFLAGS)] + [] + >, OpSize, TB, Requires<[FastBTMem]>; +def BT32mr : I<0xA3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2), + "bt{l}\t{$src2, $src1|$src1, $src2}", +// [(X86bt (loadi32 addr:$src1), GR32:$src2), +// (implicit EFLAGS)] + [] + >, TB, Requires<[FastBTMem]>; +def BT64mr : RI<0xA3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), + "bt{q}\t{$src2, $src1|$src1, $src2}", +// [(X86bt (loadi64 addr:$src1), GR64:$src2), +// (implicit EFLAGS)] + [] + >, TB; + +def BT16ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR16:$src1, i16i8imm:$src2), + "bt{w}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt GR16:$src1, i16immSExt8:$src2))]>, + OpSize, TB; +def BT32ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR32:$src1, i32i8imm:$src2), + "bt{l}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt GR32:$src1, i32immSExt8:$src2))]>, TB; +def BT64ri8 : RIi8<0xBA, MRM4r, (outs), (ins GR64:$src1, i64i8imm:$src2), + "bt{q}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt GR64:$src1, i64immSExt8:$src2))]>, TB; + +// Note that these instructions don't need FastBTMem because that +// only applies when the other operand is in a register. When it's +// an immediate, bt is still fast. 
+def BT16mi8 : Ii8<0xBA, MRM4m, (outs), (ins i16mem:$src1, i16i8imm:$src2), + "bt{w}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt (loadi16 addr:$src1), i16immSExt8:$src2)) + ]>, OpSize, TB; +def BT32mi8 : Ii8<0xBA, MRM4m, (outs), (ins i32mem:$src1, i32i8imm:$src2), + "bt{l}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt (loadi32 addr:$src1), i32immSExt8:$src2)) + ]>, TB; +def BT64mi8 : RIi8<0xBA, MRM4m, (outs), (ins i64mem:$src1, i64i8imm:$src2), + "bt{q}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt (loadi64 addr:$src1), + i64immSExt8:$src2))]>, TB; + + +def BTC16rr : I<0xBB, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2), + "btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB; +def BTC32rr : I<0xBB, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2), + "btc{l}\t{$src2, $src1|$src1, $src2}", []>, TB; +def BTC64rr : RI<0xBB, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2), + "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB; +def BTC16mr : I<0xBB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), + "btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB; +def BTC32mr : I<0xBB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2), + "btc{l}\t{$src2, $src1|$src1, $src2}", []>, TB; +def BTC64mr : RI<0xBB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), + "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB; +def BTC16ri8 : Ii8<0xBA, MRM7r, (outs), (ins GR16:$src1, i16i8imm:$src2), + "btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB; +def BTC32ri8 : Ii8<0xBA, MRM7r, (outs), (ins GR32:$src1, i32i8imm:$src2), + "btc{l}\t{$src2, $src1|$src1, $src2}", []>, TB; +def BTC64ri8 : RIi8<0xBA, MRM7r, (outs), (ins GR64:$src1, i64i8imm:$src2), + "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB; +def BTC16mi8 : Ii8<0xBA, MRM7m, (outs), (ins i16mem:$src1, i16i8imm:$src2), + "btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB; +def BTC32mi8 : Ii8<0xBA, MRM7m, (outs), (ins i32mem:$src1, i32i8imm:$src2), + "btc{l}\t{$src2, $src1|$src1, $src2}", []>, TB; +def BTC64mi8 : RIi8<0xBA, MRM7m, (outs), (ins i64mem:$src1, i64i8imm:$src2), + "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB; + +def BTR16rr : I<0xB3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2), + "btr{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB; +def BTR32rr : I<0xB3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2), + "btr{l}\t{$src2, $src1|$src1, $src2}", []>, TB; +def BTR64rr : RI<0xB3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2), + "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB; +def BTR16mr : I<0xB3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), + "btr{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB; +def BTR32mr : I<0xB3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2), + "btr{l}\t{$src2, $src1|$src1, $src2}", []>, TB; +def BTR64mr : RI<0xB3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), + "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB; +def BTR16ri8 : Ii8<0xBA, MRM6r, (outs), (ins GR16:$src1, i16i8imm:$src2), + "btr{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB; +def BTR32ri8 : Ii8<0xBA, MRM6r, (outs), (ins GR32:$src1, i32i8imm:$src2), + "btr{l}\t{$src2, $src1|$src1, $src2}", []>, TB; +def BTR64ri8 : RIi8<0xBA, MRM6r, (outs), (ins GR64:$src1, i64i8imm:$src2), + "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB; +def BTR16mi8 : Ii8<0xBA, MRM6m, (outs), (ins i16mem:$src1, i16i8imm:$src2), + "btr{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB; +def BTR32mi8 : Ii8<0xBA, MRM6m, (outs), (ins i32mem:$src1, i32i8imm:$src2), + "btr{l}\t{$src2, $src1|$src1, $src2}", []>, TB; +def BTR64mi8 : 
RIi8<0xBA, MRM6m, (outs), (ins i64mem:$src1, i64i8imm:$src2), + "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB; + +def BTS16rr : I<0xAB, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2), + "bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB; +def BTS32rr : I<0xAB, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2), + "bts{l}\t{$src2, $src1|$src1, $src2}", []>, TB; +def BTS64rr : RI<0xAB, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2), + "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB; +def BTS16mr : I<0xAB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), + "bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB; +def BTS32mr : I<0xAB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2), + "bts{l}\t{$src2, $src1|$src1, $src2}", []>, TB; +def BTS64mr : RI<0xAB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), + "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB; +def BTS16ri8 : Ii8<0xBA, MRM5r, (outs), (ins GR16:$src1, i16i8imm:$src2), + "bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB; +def BTS32ri8 : Ii8<0xBA, MRM5r, (outs), (ins GR32:$src1, i32i8imm:$src2), + "bts{l}\t{$src2, $src1|$src1, $src2}", []>, TB; +def BTS64ri8 : RIi8<0xBA, MRM5r, (outs), (ins GR64:$src1, i64i8imm:$src2), + "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB; +def BTS16mi8 : Ii8<0xBA, MRM5m, (outs), (ins i16mem:$src1, i16i8imm:$src2), + "bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB; +def BTS32mi8 : Ii8<0xBA, MRM5m, (outs), (ins i32mem:$src1, i32i8imm:$src2), + "bts{l}\t{$src2, $src1|$src1, $src2}", []>, TB; +def BTS64mi8 : RIi8<0xBA, MRM5m, (outs), (ins i64mem:$src1, i64i8imm:$src2), + "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB; +} // Defs = [EFLAGS] + + +//===----------------------------------------------------------------------===// +// Atomic support +// + + +// Atomic swap. These are just normal xchg instructions. But since a memory +// operand is referenced, the atomicity is ensured. +let Constraints = "$val = $dst" in { +def XCHG8rm : I<0x86, MRMSrcMem, (outs GR8:$dst), (ins GR8:$val, i8mem:$ptr), + "xchg{b}\t{$val, $ptr|$ptr, $val}", + [(set GR8:$dst, (atomic_swap_8 addr:$ptr, GR8:$val))]>; +def XCHG16rm : I<0x87, MRMSrcMem, (outs GR16:$dst),(ins GR16:$val, i16mem:$ptr), + "xchg{w}\t{$val, $ptr|$ptr, $val}", + [(set GR16:$dst, (atomic_swap_16 addr:$ptr, GR16:$val))]>, + OpSize; +def XCHG32rm : I<0x87, MRMSrcMem, (outs GR32:$dst),(ins GR32:$val, i32mem:$ptr), + "xchg{l}\t{$val, $ptr|$ptr, $val}", + [(set GR32:$dst, (atomic_swap_32 addr:$ptr, GR32:$val))]>; +def XCHG64rm : RI<0x87, MRMSrcMem, (outs GR64:$dst),(ins GR64:$val,i64mem:$ptr), + "xchg{q}\t{$val, $ptr|$ptr, $val}", + [(set GR64:$dst, (atomic_swap_64 addr:$ptr, GR64:$val))]>; + +def XCHG8rr : I<0x86, MRMSrcReg, (outs GR8:$dst), (ins GR8:$val, GR8:$src), + "xchg{b}\t{$val, $src|$src, $val}", []>; +def XCHG16rr : I<0x87, MRMSrcReg, (outs GR16:$dst), (ins GR16:$val, GR16:$src), + "xchg{w}\t{$val, $src|$src, $val}", []>, OpSize; +def XCHG32rr : I<0x87, MRMSrcReg, (outs GR32:$dst), (ins GR32:$val, GR32:$src), + "xchg{l}\t{$val, $src|$src, $val}", []>; +def XCHG64rr : RI<0x87, MRMSrcReg, (outs GR64:$dst), (ins GR64:$val,GR64:$src), + "xchg{q}\t{$val, $src|$src, $val}", []>; +} + +def XCHG16ar : I<0x90, AddRegFrm, (outs), (ins GR16:$src), + "xchg{w}\t{$src, %ax|AX, $src}", []>, OpSize; +def XCHG32ar : I<0x90, AddRegFrm, (outs), (ins GR32:$src), + "xchg{l}\t{$src, %eax|EAX, $src}", []>, Requires<[In32BitMode]>; +// Uses GR32_NOAX in 64-bit mode to prevent encoding using the 0x90 NOP encoding. 
+// xchg %eax, %eax needs to clear upper 32-bits of RAX so is not a NOP. +def XCHG32ar64 : I<0x90, AddRegFrm, (outs), (ins GR32_NOAX:$src), + "xchg{l}\t{$src, %eax|EAX, $src}", []>, Requires<[In64BitMode]>; +def XCHG64ar : RI<0x90, AddRegFrm, (outs), (ins GR64:$src), + "xchg{q}\t{$src, %rax|RAX, $src}", []>; + + + +def XADD8rr : I<0xC0, MRMDestReg, (outs GR8:$dst), (ins GR8:$src), + "xadd{b}\t{$src, $dst|$dst, $src}", []>, TB; +def XADD16rr : I<0xC1, MRMDestReg, (outs GR16:$dst), (ins GR16:$src), + "xadd{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; +def XADD32rr : I<0xC1, MRMDestReg, (outs GR32:$dst), (ins GR32:$src), + "xadd{l}\t{$src, $dst|$dst, $src}", []>, TB; +def XADD64rr : RI<0xC1, MRMDestReg, (outs GR64:$dst), (ins GR64:$src), + "xadd{q}\t{$src, $dst|$dst, $src}", []>, TB; + +let mayLoad = 1, mayStore = 1 in { +def XADD8rm : I<0xC0, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src), + "xadd{b}\t{$src, $dst|$dst, $src}", []>, TB; +def XADD16rm : I<0xC1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src), + "xadd{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; +def XADD32rm : I<0xC1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), + "xadd{l}\t{$src, $dst|$dst, $src}", []>, TB; +def XADD64rm : RI<0xC1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), + "xadd{q}\t{$src, $dst|$dst, $src}", []>, TB; + +} + +def CMPXCHG8rr : I<0xB0, MRMDestReg, (outs GR8:$dst), (ins GR8:$src), + "cmpxchg{b}\t{$src, $dst|$dst, $src}", []>, TB; +def CMPXCHG16rr : I<0xB1, MRMDestReg, (outs GR16:$dst), (ins GR16:$src), + "cmpxchg{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; +def CMPXCHG32rr : I<0xB1, MRMDestReg, (outs GR32:$dst), (ins GR32:$src), + "cmpxchg{l}\t{$src, $dst|$dst, $src}", []>, TB; +def CMPXCHG64rr : RI<0xB1, MRMDestReg, (outs GR64:$dst), (ins GR64:$src), + "cmpxchg{q}\t{$src, $dst|$dst, $src}", []>, TB; + +let mayLoad = 1, mayStore = 1 in { +def CMPXCHG8rm : I<0xB0, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src), + "cmpxchg{b}\t{$src, $dst|$dst, $src}", []>, TB; +def CMPXCHG16rm : I<0xB1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src), + "cmpxchg{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; +def CMPXCHG32rm : I<0xB1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), + "cmpxchg{l}\t{$src, $dst|$dst, $src}", []>, TB; +def CMPXCHG64rm : RI<0xB1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), + "cmpxchg{q}\t{$src, $dst|$dst, $src}", []>, TB; +} + +let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX] in +def CMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$dst), + "cmpxchg8b\t$dst", []>, TB; + +let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX] in +def CMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$dst), + "cmpxchg16b\t$dst", []>, TB, Requires<[HasCmpxchg16b]>; + + + +// Lock instruction prefix +def LOCK_PREFIX : I<0xF0, RawFrm, (outs), (ins), "lock", []>; + +// Rex64 instruction prefix +def REX64_PREFIX : I<0x48, RawFrm, (outs), (ins), "rex64", []>; + +// Data16 instruction prefix +def DATA16_PREFIX : I<0x66, RawFrm, (outs), (ins), "data16", []>; + +// Repeat string operation instruction prefixes +// These uses the DF flag in the EFLAGS register to inc or dec ECX +let Defs = [ECX], Uses = [ECX,EFLAGS] in { +// Repeat (used with INS, OUTS, MOVS, LODS and STOS) +def REP_PREFIX : I<0xF3, RawFrm, (outs), (ins), "rep", []>; +// Repeat while not equal (used with CMPS and SCAS) +def REPNE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "repne", []>; +} + + +// String manipulation instructions +def LODSB : I<0xAC, RawFrm, (outs), (ins), "lodsb", []>; +def LODSW : I<0xAD, RawFrm, (outs), 
(ins), "lodsw", []>, OpSize; +def LODSD : I<0xAD, RawFrm, (outs), (ins), "lods{l|d}", []>; +def LODSQ : RI<0xAD, RawFrm, (outs), (ins), "lodsq", []>; + +def OUTSB : I<0x6E, RawFrm, (outs), (ins), "outsb", []>; +def OUTSW : I<0x6F, RawFrm, (outs), (ins), "outsw", []>, OpSize; +def OUTSD : I<0x6F, RawFrm, (outs), (ins), "outs{l|d}", []>; + + +// Flag instructions +def CLC : I<0xF8, RawFrm, (outs), (ins), "clc", []>; +def STC : I<0xF9, RawFrm, (outs), (ins), "stc", []>; +def CLI : I<0xFA, RawFrm, (outs), (ins), "cli", []>; +def STI : I<0xFB, RawFrm, (outs), (ins), "sti", []>; +def CLD : I<0xFC, RawFrm, (outs), (ins), "cld", []>; +def STD : I<0xFD, RawFrm, (outs), (ins), "std", []>; +def CMC : I<0xF5, RawFrm, (outs), (ins), "cmc", []>; + +def CLTS : I<0x06, RawFrm, (outs), (ins), "clts", []>, TB; + +// Table lookup instructions +def XLAT : I<0xD7, RawFrm, (outs), (ins), "xlatb", []>; + +// ASCII Adjust After Addition +// sets AL, AH and CF and AF of EFLAGS and uses AL and AF of EFLAGS +def AAA : I<0x37, RawFrm, (outs), (ins), "aaa", []>, Requires<[In32BitMode]>; + +// ASCII Adjust AX Before Division +// sets AL, AH and EFLAGS and uses AL and AH +def AAD8i8 : Ii8<0xD5, RawFrm, (outs), (ins i8imm:$src), + "aad\t$src", []>, Requires<[In32BitMode]>; + +// ASCII Adjust AX After Multiply +// sets AL, AH and EFLAGS and uses AL +def AAM8i8 : Ii8<0xD4, RawFrm, (outs), (ins i8imm:$src), + "aam\t$src", []>, Requires<[In32BitMode]>; + +// ASCII Adjust AL After Subtraction - sets +// sets AL, AH and CF and AF of EFLAGS and uses AL and AF of EFLAGS +def AAS : I<0x3F, RawFrm, (outs), (ins), "aas", []>, Requires<[In32BitMode]>; + +// Decimal Adjust AL after Addition +// sets AL, CF and AF of EFLAGS and uses AL, CF and AF of EFLAGS +def DAA : I<0x27, RawFrm, (outs), (ins), "daa", []>, Requires<[In32BitMode]>; + +// Decimal Adjust AL after Subtraction +// sets AL, CF and AF of EFLAGS and uses AL, CF and AF of EFLAGS +def DAS : I<0x2F, RawFrm, (outs), (ins), "das", []>, Requires<[In32BitMode]>; + +// Check Array Index Against Bounds +def BOUNDS16rm : I<0x62, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "bound\t{$src, $dst|$dst, $src}", []>, OpSize, + Requires<[In32BitMode]>; +def BOUNDS32rm : I<0x62, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "bound\t{$src, $dst|$dst, $src}", []>, + Requires<[In32BitMode]>; + +// Adjust RPL Field of Segment Selector +def ARPL16rr : I<0x63, MRMDestReg, (outs GR16:$src), (ins GR16:$dst), + "arpl\t{$src, $dst|$dst, $src}", []>, Requires<[In32BitMode]>; +def ARPL16mr : I<0x63, MRMSrcMem, (outs GR16:$src), (ins i16mem:$dst), + "arpl\t{$src, $dst|$dst, $src}", []>, Requires<[In32BitMode]>; + +//===----------------------------------------------------------------------===// +// MOVBE Instructions +// +let Predicates = [HasMOVBE] in { + def MOVBE16rm : I<0xF0, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "movbe{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, (bswap (loadi16 addr:$src)))]>, OpSize, T8; + def MOVBE32rm : I<0xF0, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "movbe{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (bswap (loadi32 addr:$src)))]>, T8; + def MOVBE64rm : RI<0xF0, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "movbe{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (bswap (loadi64 addr:$src)))]>, T8; + def MOVBE16mr : I<0xF1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src), + "movbe{w}\t{$src, $dst|$dst, $src}", + [(store (bswap GR16:$src), addr:$dst)]>, OpSize, T8; + def MOVBE32mr : I<0xF1, MRMDestMem, (outs), (ins i32mem:$dst, 
GR32:$src), + "movbe{l}\t{$src, $dst|$dst, $src}", + [(store (bswap GR32:$src), addr:$dst)]>, T8; + def MOVBE64mr : RI<0xF1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), + "movbe{q}\t{$src, $dst|$dst, $src}", + [(store (bswap GR64:$src), addr:$dst)]>, T8; +} + +//===----------------------------------------------------------------------===// +// RDRAND Instruction +// +let Predicates = [HasRDRAND], Defs = [EFLAGS] in { + def RDRAND16r : I<0xC7, MRM6r, (outs GR16:$dst), (ins), + "rdrand{w}\t$dst", []>, OpSize, TB; + def RDRAND32r : I<0xC7, MRM6r, (outs GR32:$dst), (ins), + "rdrand{l}\t$dst", []>, TB; + def RDRAND64r : RI<0xC7, MRM6r, (outs GR64:$dst), (ins), + "rdrand{q}\t$dst", []>, TB; +} + +//===----------------------------------------------------------------------===// +// LZCNT Instruction +// +let Predicates = [HasLZCNT], Defs = [EFLAGS] in { + def LZCNT16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), + "lzcnt{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, (ctlz GR16:$src)), (implicit EFLAGS)]>, XS, + OpSize; + def LZCNT16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "lzcnt{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, (ctlz (loadi16 addr:$src))), + (implicit EFLAGS)]>, XS, OpSize; + + def LZCNT32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "lzcnt{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (ctlz GR32:$src)), (implicit EFLAGS)]>, XS; + def LZCNT32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "lzcnt{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (ctlz (loadi32 addr:$src))), + (implicit EFLAGS)]>, XS; + + def LZCNT64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), + "lzcnt{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (ctlz GR64:$src)), (implicit EFLAGS)]>, + XS; + def LZCNT64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "lzcnt{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (ctlz (loadi64 addr:$src))), + (implicit EFLAGS)]>, XS; +} + +//===----------------------------------------------------------------------===// +// TZCNT Instruction +// +let Predicates = [HasBMI], Defs = [EFLAGS] in { + def TZCNT16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), + "tzcnt{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, (cttz GR16:$src)), (implicit EFLAGS)]>, XS, + OpSize; + def TZCNT16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "tzcnt{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, (cttz (loadi16 addr:$src))), + (implicit EFLAGS)]>, XS, OpSize; + + def TZCNT32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "tzcnt{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (cttz GR32:$src)), (implicit EFLAGS)]>, XS; + def TZCNT32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "tzcnt{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (cttz (loadi32 addr:$src))), + (implicit EFLAGS)]>, XS; + + def TZCNT64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), + "tzcnt{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (cttz GR64:$src)), (implicit EFLAGS)]>, + XS; + def TZCNT64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "tzcnt{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (cttz (loadi64 addr:$src))), + (implicit EFLAGS)]>, XS; +} + +//===----------------------------------------------------------------------===// +// Subsystems. 
+//===----------------------------------------------------------------------===// + +include "X86InstrArithmetic.td" +include "X86InstrCMovSetCC.td" +include "X86InstrExtension.td" +include "X86InstrControl.td" +include "X86InstrShiftRotate.td" + +// X87 Floating Point Stack. +include "X86InstrFPStack.td" + +// SIMD support (SSE, MMX and AVX) +include "X86InstrFragmentsSIMD.td" + +// FMA - Fused Multiply-Add support (requires FMA) +include "X86InstrFMA.td" + +// SSE, MMX and 3DNow! vector support. +include "X86InstrSSE.td" +include "X86InstrMMX.td" +include "X86Instr3DNow.td" + +include "X86InstrVMX.td" + +// System instructions. +include "X86InstrSystem.td" + +// Compiler Pseudo Instructions and Pat Patterns +include "X86InstrCompiler.td" + +//===----------------------------------------------------------------------===// +// Assembler Mnemonic Aliases +//===----------------------------------------------------------------------===// + +def : MnemonicAlias<"call", "calll">, Requires<[In32BitMode]>; +def : MnemonicAlias<"call", "callq">, Requires<[In64BitMode]>; + +def : MnemonicAlias<"cbw", "cbtw">; +def : MnemonicAlias<"cwd", "cwtd">; +def : MnemonicAlias<"cdq", "cltd">; +def : MnemonicAlias<"cwde", "cwtl">; +def : MnemonicAlias<"cdqe", "cltq">; + +// lret maps to lretl, it is not ambiguous with lretq. +def : MnemonicAlias<"lret", "lretl">; + +def : MnemonicAlias<"leavel", "leave">, Requires<[In32BitMode]>; +def : MnemonicAlias<"leaveq", "leave">, Requires<[In64BitMode]>; + +def : MnemonicAlias<"loopz", "loope">; +def : MnemonicAlias<"loopnz", "loopne">; + +def : MnemonicAlias<"pop", "popl">, Requires<[In32BitMode]>; +def : MnemonicAlias<"pop", "popq">, Requires<[In64BitMode]>; +def : MnemonicAlias<"popf", "popfl">, Requires<[In32BitMode]>; +def : MnemonicAlias<"popf", "popfq">, Requires<[In64BitMode]>; +def : MnemonicAlias<"popfd", "popfl">; + +// FIXME: This is wrong for "push reg". "push %bx" should turn into pushw in +// all modes. However: "push (addr)" and "push $42" should default to +// pushl/pushq depending on the current mode. Similar for "pop %bx" +def : MnemonicAlias<"push", "pushl">, Requires<[In32BitMode]>; +def : MnemonicAlias<"push", "pushq">, Requires<[In64BitMode]>; +def : MnemonicAlias<"pushf", "pushfl">, Requires<[In32BitMode]>; +def : MnemonicAlias<"pushf", "pushfq">, Requires<[In64BitMode]>; +def : MnemonicAlias<"pushfd", "pushfl">; + +def : MnemonicAlias<"repe", "rep">; +def : MnemonicAlias<"repz", "rep">; +def : MnemonicAlias<"repnz", "repne">; + +def : MnemonicAlias<"retl", "ret">, Requires<[In32BitMode]>; +def : MnemonicAlias<"retq", "ret">, Requires<[In64BitMode]>; + +def : MnemonicAlias<"salb", "shlb">; +def : MnemonicAlias<"salw", "shlw">; +def : MnemonicAlias<"sall", "shll">; +def : MnemonicAlias<"salq", "shlq">; + +def : MnemonicAlias<"smovb", "movsb">; +def : MnemonicAlias<"smovw", "movsw">; +def : MnemonicAlias<"smovl", "movsl">; +def : MnemonicAlias<"smovq", "movsq">; + +def : MnemonicAlias<"ud2a", "ud2">; +def : MnemonicAlias<"verrw", "verr">; + +// System instruction aliases. 
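+// For example, with the aliases below "lgdtl (%eax)" in 32-bit mode and
+// "lgdtq (%rax)" in 64-bit mode are both accepted and matched as the plain
+// "lgdt" form.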
+def : MnemonicAlias<"iret", "iretl">; +def : MnemonicAlias<"sysret", "sysretl">; + +def : MnemonicAlias<"lgdtl", "lgdt">, Requires<[In32BitMode]>; +def : MnemonicAlias<"lgdtq", "lgdt">, Requires<[In64BitMode]>; +def : MnemonicAlias<"lidtl", "lidt">, Requires<[In32BitMode]>; +def : MnemonicAlias<"lidtq", "lidt">, Requires<[In64BitMode]>; +def : MnemonicAlias<"sgdtl", "sgdt">, Requires<[In32BitMode]>; +def : MnemonicAlias<"sgdtq", "sgdt">, Requires<[In64BitMode]>; +def : MnemonicAlias<"sidtl", "sidt">, Requires<[In32BitMode]>; +def : MnemonicAlias<"sidtq", "sidt">, Requires<[In64BitMode]>; + + +// Floating point stack aliases. +def : MnemonicAlias<"fcmovz", "fcmove">; +def : MnemonicAlias<"fcmova", "fcmovnbe">; +def : MnemonicAlias<"fcmovnae", "fcmovb">; +def : MnemonicAlias<"fcmovna", "fcmovbe">; +def : MnemonicAlias<"fcmovae", "fcmovnb">; +def : MnemonicAlias<"fcomip", "fcompi">; +def : MnemonicAlias<"fildq", "fildll">; +def : MnemonicAlias<"fldcww", "fldcw">; +def : MnemonicAlias<"fnstcww", "fnstcw">; +def : MnemonicAlias<"fnstsww", "fnstsw">; +def : MnemonicAlias<"fucomip", "fucompi">; +def : MnemonicAlias<"fwait", "wait">; + + +class CondCodeAlias<string Prefix,string Suffix, string OldCond, string NewCond> + : MnemonicAlias<!strconcat(Prefix, OldCond, Suffix), + !strconcat(Prefix, NewCond, Suffix)>; + +/// IntegerCondCodeMnemonicAlias - This multiclass defines a bunch of +/// MnemonicAlias's that canonicalize the condition code in a mnemonic, for +/// example "setz" -> "sete". +multiclass IntegerCondCodeMnemonicAlias<string Prefix, string Suffix> { + def C : CondCodeAlias<Prefix, Suffix, "c", "b">; // setc -> setb + def Z : CondCodeAlias<Prefix, Suffix, "z" , "e">; // setz -> sete + def NA : CondCodeAlias<Prefix, Suffix, "na", "be">; // setna -> setbe + def NB : CondCodeAlias<Prefix, Suffix, "nb", "ae">; // setnb -> setae + def NC : CondCodeAlias<Prefix, Suffix, "nc", "ae">; // setnc -> setae + def NG : CondCodeAlias<Prefix, Suffix, "ng", "le">; // setng -> setle + def NL : CondCodeAlias<Prefix, Suffix, "nl", "ge">; // setnl -> setge + def NZ : CondCodeAlias<Prefix, Suffix, "nz", "ne">; // setnz -> setne + def PE : CondCodeAlias<Prefix, Suffix, "pe", "p">; // setpe -> setp + def PO : CondCodeAlias<Prefix, Suffix, "po", "np">; // setpo -> setnp + + def NAE : CondCodeAlias<Prefix, Suffix, "nae", "b">; // setnae -> setb + def NBE : CondCodeAlias<Prefix, Suffix, "nbe", "a">; // setnbe -> seta + def NGE : CondCodeAlias<Prefix, Suffix, "nge", "l">; // setnge -> setl + def NLE : CondCodeAlias<Prefix, Suffix, "nle", "g">; // setnle -> setg +} + +// Aliases for set<CC> +defm : IntegerCondCodeMnemonicAlias<"set", "">; +// Aliases for j<CC> +defm : IntegerCondCodeMnemonicAlias<"j", "">; +// Aliases for cmov<CC>{w,l,q} +defm : IntegerCondCodeMnemonicAlias<"cmov", "w">; +defm : IntegerCondCodeMnemonicAlias<"cmov", "l">; +defm : IntegerCondCodeMnemonicAlias<"cmov", "q">; + + +//===----------------------------------------------------------------------===// +// Assembler Instruction Aliases +//===----------------------------------------------------------------------===// + +// aad/aam default to base 10 if no operand is specified. +def : InstAlias<"aad", (AAD8i8 10)>; +def : InstAlias<"aam", (AAM8i8 10)>; + +// Disambiguate the mem/imm form of bt-without-a-suffix as btl. +def : InstAlias<"bt $imm, $mem", (BT32mi8 i32mem:$mem, i32i8imm:$imm)>; + +// clr aliases. 
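+// "clr" has no encoding of its own; e.g. "clrl %eax" is simply matched as
+// "xorl %eax, %eax" via the aliases below.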
+def : InstAlias<"clrb $reg", (XOR8rr GR8 :$reg, GR8 :$reg)>; +def : InstAlias<"clrw $reg", (XOR16rr GR16:$reg, GR16:$reg)>; +def : InstAlias<"clrl $reg", (XOR32rr GR32:$reg, GR32:$reg)>; +def : InstAlias<"clrq $reg", (XOR64rr GR64:$reg, GR64:$reg)>; + +// div and idiv aliases for explicit A register. +def : InstAlias<"divb $src, %al", (DIV8r GR8 :$src)>; +def : InstAlias<"divw $src, %ax", (DIV16r GR16:$src)>; +def : InstAlias<"divl $src, %eax", (DIV32r GR32:$src)>; +def : InstAlias<"divq $src, %rax", (DIV64r GR64:$src)>; +def : InstAlias<"divb $src, %al", (DIV8m i8mem :$src)>; +def : InstAlias<"divw $src, %ax", (DIV16m i16mem:$src)>; +def : InstAlias<"divl $src, %eax", (DIV32m i32mem:$src)>; +def : InstAlias<"divq $src, %rax", (DIV64m i64mem:$src)>; +def : InstAlias<"idivb $src, %al", (IDIV8r GR8 :$src)>; +def : InstAlias<"idivw $src, %ax", (IDIV16r GR16:$src)>; +def : InstAlias<"idivl $src, %eax", (IDIV32r GR32:$src)>; +def : InstAlias<"idivq $src, %rax", (IDIV64r GR64:$src)>; +def : InstAlias<"idivb $src, %al", (IDIV8m i8mem :$src)>; +def : InstAlias<"idivw $src, %ax", (IDIV16m i16mem:$src)>; +def : InstAlias<"idivl $src, %eax", (IDIV32m i32mem:$src)>; +def : InstAlias<"idivq $src, %rax", (IDIV64m i64mem:$src)>; + + + +// Various unary fpstack operations default to operating on on ST1. +// For example, "fxch" -> "fxch %st(1)" +def : InstAlias<"faddp", (ADD_FPrST0 ST1), 0>; +def : InstAlias<"fsubp", (SUBR_FPrST0 ST1)>; +def : InstAlias<"fsubrp", (SUB_FPrST0 ST1)>; +def : InstAlias<"fmulp", (MUL_FPrST0 ST1)>; +def : InstAlias<"fdivp", (DIVR_FPrST0 ST1)>; +def : InstAlias<"fdivrp", (DIV_FPrST0 ST1)>; +def : InstAlias<"fxch", (XCH_F ST1)>; +def : InstAlias<"fcomi", (COM_FIr ST1)>; +def : InstAlias<"fcompi", (COM_FIPr ST1)>; +def : InstAlias<"fucom", (UCOM_Fr ST1)>; +def : InstAlias<"fucomp", (UCOM_FPr ST1)>; +def : InstAlias<"fucomi", (UCOM_FIr ST1)>; +def : InstAlias<"fucompi", (UCOM_FIPr ST1)>; + +// Handle fmul/fadd/fsub/fdiv instructions with explicitly written st(0) op. +// For example, "fadd %st(4), %st(0)" -> "fadd %st(4)". We also disambiguate +// instructions like "fadd %st(0), %st(0)" as "fadd %st(0)" for consistency with +// gas. +multiclass FpUnaryAlias<string Mnemonic, Instruction Inst, bit EmitAlias = 1> { + def : InstAlias<!strconcat(Mnemonic, " $op, %st(0)"), + (Inst RST:$op), EmitAlias>; + def : InstAlias<!strconcat(Mnemonic, " %st(0), %st(0)"), + (Inst ST0), EmitAlias>; +} + +defm : FpUnaryAlias<"fadd", ADD_FST0r>; +defm : FpUnaryAlias<"faddp", ADD_FPrST0, 0>; +defm : FpUnaryAlias<"fsub", SUB_FST0r>; +defm : FpUnaryAlias<"fsubp", SUBR_FPrST0>; +defm : FpUnaryAlias<"fsubr", SUBR_FST0r>; +defm : FpUnaryAlias<"fsubrp", SUB_FPrST0>; +defm : FpUnaryAlias<"fmul", MUL_FST0r>; +defm : FpUnaryAlias<"fmulp", MUL_FPrST0>; +defm : FpUnaryAlias<"fdiv", DIV_FST0r>; +defm : FpUnaryAlias<"fdivp", DIVR_FPrST0>; +defm : FpUnaryAlias<"fdivr", DIVR_FST0r>; +defm : FpUnaryAlias<"fdivrp", DIV_FPrST0>; +defm : FpUnaryAlias<"fcomi", COM_FIr, 0>; +defm : FpUnaryAlias<"fucomi", UCOM_FIr, 0>; +defm : FpUnaryAlias<"fcompi", COM_FIPr>; +defm : FpUnaryAlias<"fucompi", UCOM_FIPr>; + + +// Handle "f{mulp,addp} st(0), $op" the same as "f{mulp,addp} $op", since they +// commute. We also allow fdiv[r]p/fsubrp even though they don't commute, +// solely because gas supports it. 
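+// For example, "faddp %st(0), %st(2)" is matched below the same way as
+// "faddp %st(2)", i.e. as ADD_FPrST0.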
+def : InstAlias<"faddp %st(0), $op", (ADD_FPrST0 RST:$op), 0>; +def : InstAlias<"fmulp %st(0), $op", (MUL_FPrST0 RST:$op)>; +def : InstAlias<"fsubp %st(0), $op", (SUBR_FPrST0 RST:$op)>; +def : InstAlias<"fsubrp %st(0), $op", (SUB_FPrST0 RST:$op)>; +def : InstAlias<"fdivp %st(0), $op", (DIVR_FPrST0 RST:$op)>; +def : InstAlias<"fdivrp %st(0), $op", (DIV_FPrST0 RST:$op)>; + +// We accept "fnstsw %eax" even though it only writes %ax. +def : InstAlias<"fnstsw %eax", (FNSTSW8r)>; +def : InstAlias<"fnstsw %al" , (FNSTSW8r)>; +def : InstAlias<"fnstsw" , (FNSTSW8r)>; + +// lcall and ljmp aliases. This seems to be an odd mapping in 64-bit mode, but +// this is compatible with what GAS does. +def : InstAlias<"lcall $seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>; +def : InstAlias<"ljmp $seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>; +def : InstAlias<"lcall *$dst", (FARCALL32m opaque48mem:$dst)>; +def : InstAlias<"ljmp *$dst", (FARJMP32m opaque48mem:$dst)>; + +// "imul <imm>, B" is an alias for "imul <imm>, B, B". +def : InstAlias<"imulw $imm, $r", (IMUL16rri GR16:$r, GR16:$r, i16imm:$imm)>; +def : InstAlias<"imulw $imm, $r", (IMUL16rri8 GR16:$r, GR16:$r, i16i8imm:$imm)>; +def : InstAlias<"imull $imm, $r", (IMUL32rri GR32:$r, GR32:$r, i32imm:$imm)>; +def : InstAlias<"imull $imm, $r", (IMUL32rri8 GR32:$r, GR32:$r, i32i8imm:$imm)>; +def : InstAlias<"imulq $imm, $r",(IMUL64rri32 GR64:$r, GR64:$r,i64i32imm:$imm)>; +def : InstAlias<"imulq $imm, $r", (IMUL64rri8 GR64:$r, GR64:$r, i64i8imm:$imm)>; + +// inb %dx -> inb %al, %dx +def : InstAlias<"inb %dx", (IN8rr)>; +def : InstAlias<"inw %dx", (IN16rr)>; +def : InstAlias<"inl %dx", (IN32rr)>; +def : InstAlias<"inb $port", (IN8ri i8imm:$port)>; +def : InstAlias<"inw $port", (IN16ri i8imm:$port)>; +def : InstAlias<"inl $port", (IN32ri i8imm:$port)>; + + +// jmp and call aliases for lcall and ljmp. jmp $42,$5 -> ljmp +def : InstAlias<"call $seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>; +def : InstAlias<"jmp $seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>; +def : InstAlias<"callw $seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>; +def : InstAlias<"jmpw $seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg)>; +def : InstAlias<"calll $seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>; +def : InstAlias<"jmpl $seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>; + +// Force mov without a suffix with a segment and mem to prefer the 'l' form of +// the move. All segment/mem forms are equivalent, this has the shortest +// encoding. +def : InstAlias<"mov $mem, $seg", (MOV32sm SEGMENT_REG:$seg, i32mem:$mem)>; +def : InstAlias<"mov $seg, $mem", (MOV32ms i32mem:$mem, SEGMENT_REG:$seg)>; + +// Match 'movq <largeimm>, <reg>' as an alias for movabsq. +def : InstAlias<"movq $imm, $reg", (MOV64ri GR64:$reg, i64imm:$imm)>; + +// Match 'movq GR64, MMX' as an alias for movd. +def : InstAlias<"movq $src, $dst", + (MMX_MOVD64to64rr VR64:$dst, GR64:$src), 0>; +def : InstAlias<"movq $src, $dst", + (MMX_MOVD64from64rr GR64:$dst, VR64:$src), 0>; + +// movsd with no operands (as opposed to the SSE scalar move of a double) is an +// alias for movsl. 
(as in rep; movsd) +def : InstAlias<"movsd", (MOVSD)>; + +// movsx aliases +def : InstAlias<"movsx $src, $dst", (MOVSX16rr8 GR16:$dst, GR8:$src), 0>; +def : InstAlias<"movsx $src, $dst", (MOVSX16rm8 GR16:$dst, i8mem:$src), 0>; +def : InstAlias<"movsx $src, $dst", (MOVSX32rr8 GR32:$dst, GR8:$src), 0>; +def : InstAlias<"movsx $src, $dst", (MOVSX32rr16 GR32:$dst, GR16:$src), 0>; +def : InstAlias<"movsx $src, $dst", (MOVSX64rr8 GR64:$dst, GR8:$src), 0>; +def : InstAlias<"movsx $src, $dst", (MOVSX64rr16 GR64:$dst, GR16:$src), 0>; +def : InstAlias<"movsx $src, $dst", (MOVSX64rr32 GR64:$dst, GR32:$src), 0>; + +// movzx aliases +def : InstAlias<"movzx $src, $dst", (MOVZX16rr8 GR16:$dst, GR8:$src), 0>; +def : InstAlias<"movzx $src, $dst", (MOVZX16rm8 GR16:$dst, i8mem:$src), 0>; +def : InstAlias<"movzx $src, $dst", (MOVZX32rr8 GR32:$dst, GR8:$src), 0>; +def : InstAlias<"movzx $src, $dst", (MOVZX32rr16 GR32:$dst, GR16:$src), 0>; +def : InstAlias<"movzx $src, $dst", (MOVZX64rr8_Q GR64:$dst, GR8:$src), 0>; +def : InstAlias<"movzx $src, $dst", (MOVZX64rr16_Q GR64:$dst, GR16:$src), 0>; +// Note: No GR32->GR64 movzx form. + +// outb %dx -> outb %al, %dx +def : InstAlias<"outb %dx", (OUT8rr)>; +def : InstAlias<"outw %dx", (OUT16rr)>; +def : InstAlias<"outl %dx", (OUT32rr)>; +def : InstAlias<"outb $port", (OUT8ir i8imm:$port)>; +def : InstAlias<"outw $port", (OUT16ir i8imm:$port)>; +def : InstAlias<"outl $port", (OUT32ir i8imm:$port)>; + +// 'sldt <mem>' can be encoded with either sldtw or sldtq with the same +// effect (both store to a 16-bit mem). Force to sldtw to avoid ambiguity +// errors, since its encoding is the most compact. +def : InstAlias<"sldt $mem", (SLDT16m i16mem:$mem)>; + +// shld/shrd op,op -> shld op, op, 1 +def : InstAlias<"shldw $r1, $r2", (SHLD16rri8 GR16:$r1, GR16:$r2, 1)>; +def : InstAlias<"shldl $r1, $r2", (SHLD32rri8 GR32:$r1, GR32:$r2, 1)>; +def : InstAlias<"shldq $r1, $r2", (SHLD64rri8 GR64:$r1, GR64:$r2, 1)>; +def : InstAlias<"shrdw $r1, $r2", (SHRD16rri8 GR16:$r1, GR16:$r2, 1)>; +def : InstAlias<"shrdl $r1, $r2", (SHRD32rri8 GR32:$r1, GR32:$r2, 1)>; +def : InstAlias<"shrdq $r1, $r2", (SHRD64rri8 GR64:$r1, GR64:$r2, 1)>; + +def : InstAlias<"shldw $mem, $reg", (SHLD16mri8 i16mem:$mem, GR16:$reg, 1)>; +def : InstAlias<"shldl $mem, $reg", (SHLD32mri8 i32mem:$mem, GR32:$reg, 1)>; +def : InstAlias<"shldq $mem, $reg", (SHLD64mri8 i64mem:$mem, GR64:$reg, 1)>; +def : InstAlias<"shrdw $mem, $reg", (SHRD16mri8 i16mem:$mem, GR16:$reg, 1)>; +def : InstAlias<"shrdl $mem, $reg", (SHRD32mri8 i32mem:$mem, GR32:$reg, 1)>; +def : InstAlias<"shrdq $mem, $reg", (SHRD64mri8 i64mem:$mem, GR64:$reg, 1)>; + +/* FIXME: This is disabled because the asm matcher is currently incapable of + * matching a fixed immediate like $1. +// "shl X, $1" is an alias for "shl X". 
+multiclass ShiftRotateByOneAlias<string Mnemonic, string Opc> { + def : InstAlias<!strconcat(Mnemonic, "b $op, $$1"), + (!cast<Instruction>(!strconcat(Opc, "8r1")) GR8:$op)>; + def : InstAlias<!strconcat(Mnemonic, "w $op, $$1"), + (!cast<Instruction>(!strconcat(Opc, "16r1")) GR16:$op)>; + def : InstAlias<!strconcat(Mnemonic, "l $op, $$1"), + (!cast<Instruction>(!strconcat(Opc, "32r1")) GR32:$op)>; + def : InstAlias<!strconcat(Mnemonic, "q $op, $$1"), + (!cast<Instruction>(!strconcat(Opc, "64r1")) GR64:$op)>; + def : InstAlias<!strconcat(Mnemonic, "b $op, $$1"), + (!cast<Instruction>(!strconcat(Opc, "8m1")) i8mem:$op)>; + def : InstAlias<!strconcat(Mnemonic, "w $op, $$1"), + (!cast<Instruction>(!strconcat(Opc, "16m1")) i16mem:$op)>; + def : InstAlias<!strconcat(Mnemonic, "l $op, $$1"), + (!cast<Instruction>(!strconcat(Opc, "32m1")) i32mem:$op)>; + def : InstAlias<!strconcat(Mnemonic, "q $op, $$1"), + (!cast<Instruction>(!strconcat(Opc, "64m1")) i64mem:$op)>; +} + +defm : ShiftRotateByOneAlias<"rcl", "RCL">; +defm : ShiftRotateByOneAlias<"rcr", "RCR">; +defm : ShiftRotateByOneAlias<"rol", "ROL">; +defm : ShiftRotateByOneAlias<"ror", "ROR">; +FIXME */ + +// test: We accept "testX <reg>, <mem>" and "testX <mem>, <reg>" as synonyms. +def : InstAlias<"testb $val, $mem", (TEST8rm GR8 :$val, i8mem :$mem)>; +def : InstAlias<"testw $val, $mem", (TEST16rm GR16:$val, i16mem:$mem)>; +def : InstAlias<"testl $val, $mem", (TEST32rm GR32:$val, i32mem:$mem)>; +def : InstAlias<"testq $val, $mem", (TEST64rm GR64:$val, i64mem:$mem)>; + +// xchg: We accept "xchgX <reg>, <mem>" and "xchgX <mem>, <reg>" as synonyms. +def : InstAlias<"xchgb $mem, $val", (XCHG8rm GR8 :$val, i8mem :$mem)>; +def : InstAlias<"xchgw $mem, $val", (XCHG16rm GR16:$val, i16mem:$mem)>; +def : InstAlias<"xchgl $mem, $val", (XCHG32rm GR32:$val, i32mem:$mem)>; +def : InstAlias<"xchgq $mem, $val", (XCHG64rm GR64:$val, i64mem:$mem)>; + +// xchg: We accept "xchgX <reg>, %eax" and "xchgX %eax, <reg>" as synonyms. +def : InstAlias<"xchgw %ax, $src", (XCHG16ar GR16:$src)>; +def : InstAlias<"xchgl %eax, $src", (XCHG32ar GR32:$src)>, Requires<[In32BitMode]>; +def : InstAlias<"xchgl %eax, $src", (XCHG32ar64 GR32_NOAX:$src)>, Requires<[In64BitMode]>; +def : InstAlias<"xchgq %rax, $src", (XCHG64ar GR64:$src)>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrMMX.td b/contrib/llvm/lib/Target/X86/X86InstrMMX.td new file mode 100644 index 0000000..b2d9fca --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrMMX.td @@ -0,0 +1,454 @@ +//====- X86InstrMMX.td - Describe the MMX Instruction Set --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the X86 MMX instruction set, defining the instructions, +// and properties of the instructions which are needed for code generation, +// machine code emission, and analysis. +// +// All instructions that use MMX should be in this file, even if they also use +// SSE. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// MMX Multiclasses +//===----------------------------------------------------------------------===// + +let Constraints = "$src1 = $dst" in { + // MMXI_binop_rm_int - Simple MMX binary operator based on intrinsic. 
+ // When this is cleaned up, remove the FIXME from X86RecognizableInstr.cpp. + multiclass MMXI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId, + bit Commutable = 0> { + def irr : MMXI<opc, MRMSrcReg, (outs VR64:$dst), + (ins VR64:$src1, VR64:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))]> { + let isCommutable = Commutable; + } + def irm : MMXI<opc, MRMSrcMem, (outs VR64:$dst), + (ins VR64:$src1, i64mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [(set VR64:$dst, (IntId VR64:$src1, + (bitconvert (load_mmx addr:$src2))))]>; + } + + multiclass MMXI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm, + string OpcodeStr, Intrinsic IntId, + Intrinsic IntId2> { + def rr : MMXI<opc, MRMSrcReg, (outs VR64:$dst), + (ins VR64:$src1, VR64:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))]>; + def rm : MMXI<opc, MRMSrcMem, (outs VR64:$dst), + (ins VR64:$src1, i64mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [(set VR64:$dst, (IntId VR64:$src1, + (bitconvert (load_mmx addr:$src2))))]>; + def ri : MMXIi8<opc2, ImmForm, (outs VR64:$dst), + (ins VR64:$src1, i32i8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [(set VR64:$dst, (IntId2 VR64:$src1, (i32 imm:$src2)))]>; + } +} + +/// Unary MMX instructions requiring SSSE3. +multiclass SS3I_unop_rm_int_mm<bits<8> opc, string OpcodeStr, + Intrinsic IntId64> { + def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR64:$dst, (IntId64 VR64:$src))]>; + + def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR64:$dst, + (IntId64 (bitconvert (memopmmx addr:$src))))]>; +} + +/// Binary MMX instructions requiring SSSE3. +let ImmT = NoImm, Constraints = "$src1 = $dst" in { +multiclass SS3I_binop_rm_int_mm<bits<8> opc, string OpcodeStr, + Intrinsic IntId64> { + let isCommutable = 0 in + def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst), + (ins VR64:$src1, VR64:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))]>; + def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst), + (ins VR64:$src1, i64mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [(set VR64:$dst, + (IntId64 VR64:$src1, + (bitconvert (memopmmx addr:$src2))))]>; +} +} + +/// PALIGN MMX instructions (require SSSE3). 
+multiclass ssse3_palign_mm<string asm, Intrinsic IntId> { + def R64irr : SS3AI<0x0F, MRMSrcReg, (outs VR64:$dst), + (ins VR64:$src1, VR64:$src2, i8imm:$src3), + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2, (i8 imm:$src3)))]>; + def R64irm : SS3AI<0x0F, MRMSrcMem, (outs VR64:$dst), + (ins VR64:$src1, i64mem:$src2, i8imm:$src3), + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR64:$dst, (IntId VR64:$src1, + (bitconvert (load_mmx addr:$src2)), (i8 imm:$src3)))]>; +} + +multiclass sse12_cvt_pint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, + Intrinsic Int, X86MemOperand x86memop, PatFrag ld_frag, + string asm, Domain d> { + def irr : PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm, + [(set DstRC:$dst, (Int SrcRC:$src))], d>; + def irm : PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm, + [(set DstRC:$dst, (Int (ld_frag addr:$src)))], d>; +} + +multiclass sse12_cvt_pint_3addr<bits<8> opc, RegisterClass SrcRC, + RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop, + PatFrag ld_frag, string asm, Domain d> { + def irr : PI<opc, MRMSrcReg, (outs DstRC:$dst),(ins DstRC:$src1, SrcRC:$src2), + asm, [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))], d>; + def irm : PI<opc, MRMSrcMem, (outs DstRC:$dst), + (ins DstRC:$src1, x86memop:$src2), asm, + [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))], d>; +} + +//===----------------------------------------------------------------------===// +// MMX EMMS Instruction +//===----------------------------------------------------------------------===// + +def MMX_EMMS : MMXI<0x77, RawFrm, (outs), (ins), "emms", + [(int_x86_mmx_emms)]>; + +//===----------------------------------------------------------------------===// +// MMX Scalar Instructions +//===----------------------------------------------------------------------===// + +// Data Transfer Instructions +def MMX_MOVD64rr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set VR64:$dst, + (x86mmx (scalar_to_vector GR32:$src)))]>; +let canFoldAsLoad = 1 in +def MMX_MOVD64rm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), (ins i32mem:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set VR64:$dst, + (x86mmx (scalar_to_vector (loadi32 addr:$src))))]>; +let mayStore = 1 in +def MMX_MOVD64mr : MMXI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR64:$src), + "movd\t{$src, $dst|$dst, $src}", []>; +def MMX_MOVD64grr : MMXI<0x7E, MRMDestReg, (outs), (ins GR32:$dst, VR64:$src), + "movd\t{$src, $dst|$dst, $src}", []>; + +let neverHasSideEffects = 1 in +def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src), + "movd\t{$src, $dst|$dst, $src}", + []>; + +// These are 64 bit moves, but since the OS X assembler doesn't +// recognize a register-register movq, we write them as +// movd. 
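+// In AT&T output these GR64<->VR64 copies are therefore printed as, e.g.,
+// "movd %rax, %mm0" rather than "movq %rax, %mm0".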
+def MMX_MOVD64from64rr : MMXRI<0x7E, MRMDestReg, + (outs GR64:$dst), (ins VR64:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, + (bitconvert VR64:$src))]>; +def MMX_MOVD64rrv164 : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set VR64:$dst, + (bitconvert GR64:$src))]>; +let neverHasSideEffects = 1 in +def MMX_MOVQ64rr : MMXI<0x6F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src), + "movq\t{$src, $dst|$dst, $src}", []>; +let canFoldAsLoad = 1 in +def MMX_MOVQ64rm : MMXI<0x6F, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set VR64:$dst, (load_mmx addr:$src))]>; +def MMX_MOVQ64mr : MMXI<0x7F, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src), + "movq\t{$src, $dst|$dst, $src}", + [(store (x86mmx VR64:$src), addr:$dst)]>; + +def MMX_MOVDQ2Qrr : SDIi8<0xD6, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src), + "movdq2q\t{$src, $dst|$dst, $src}", + [(set VR64:$dst, + (x86mmx (bitconvert + (i64 (vector_extract (v2i64 VR128:$src), + (iPTR 0))))))]>; + +def MMX_MOVQ2DQrr : SSDIi8<0xD6, MRMSrcReg, (outs VR128:$dst), (ins VR64:$src), + "movq2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v2i64 (scalar_to_vector + (i64 (bitconvert (x86mmx VR64:$src))))))]>; + +let neverHasSideEffects = 1 in +def MMX_MOVQ2FR64rr: SSDIi8<0xD6, MRMSrcReg, (outs FR64:$dst), (ins VR64:$src), + "movq2dq\t{$src, $dst|$dst, $src}", []>; + +def MMX_MOVFR642Qrr: SDIi8<0xD6, MRMSrcReg, (outs VR64:$dst), (ins FR64:$src), + "movdq2q\t{$src, $dst|$dst, $src}", []>; + +def MMX_MOVNTQmr : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src), + "movntq\t{$src, $dst|$dst, $src}", + [(int_x86_mmx_movnt_dq addr:$dst, VR64:$src)]>; + +let AddedComplexity = 15 in +// movd to MMX register zero-extends +def MMX_MOVZDI2PDIrr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set VR64:$dst, + (x86mmx (X86vzmovl (x86mmx (scalar_to_vector GR32:$src)))))]>; +let AddedComplexity = 20 in +def MMX_MOVZDI2PDIrm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), + (ins i32mem:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set VR64:$dst, + (x86mmx (X86vzmovl (x86mmx + (scalar_to_vector (loadi32 addr:$src))))))]>; + +// Arithmetic Instructions +defm MMX_PABSB : SS3I_unop_rm_int_mm<0x1C, "pabsb", int_x86_ssse3_pabs_b>; +defm MMX_PABSW : SS3I_unop_rm_int_mm<0x1D, "pabsw", int_x86_ssse3_pabs_w>; +defm MMX_PABSD : SS3I_unop_rm_int_mm<0x1E, "pabsd", int_x86_ssse3_pabs_d>; +// -- Addition +defm MMX_PADDB : MMXI_binop_rm_int<0xFC, "paddb", int_x86_mmx_padd_b, 1>; +defm MMX_PADDW : MMXI_binop_rm_int<0xFD, "paddw", int_x86_mmx_padd_w, 1>; +defm MMX_PADDD : MMXI_binop_rm_int<0xFE, "paddd", int_x86_mmx_padd_d, 1>; +defm MMX_PADDQ : MMXI_binop_rm_int<0xD4, "paddq", int_x86_mmx_padd_q, 1>; +defm MMX_PADDSB : MMXI_binop_rm_int<0xEC, "paddsb" , int_x86_mmx_padds_b, 1>; +defm MMX_PADDSW : MMXI_binop_rm_int<0xED, "paddsw" , int_x86_mmx_padds_w, 1>; + +defm MMX_PADDUSB : MMXI_binop_rm_int<0xDC, "paddusb", int_x86_mmx_paddus_b, 1>; +defm MMX_PADDUSW : MMXI_binop_rm_int<0xDD, "paddusw", int_x86_mmx_paddus_w, 1>; + +defm MMX_PHADDW : SS3I_binop_rm_int_mm<0x01, "phaddw", int_x86_ssse3_phadd_w>; +defm MMX_PHADD : SS3I_binop_rm_int_mm<0x02, "phaddd", int_x86_ssse3_phadd_d>; +defm MMX_PHADDSW : SS3I_binop_rm_int_mm<0x03, "phaddsw",int_x86_ssse3_phadd_sw>; + + +// -- Subtraction +defm MMX_PSUBB : MMXI_binop_rm_int<0xF8, "psubb", int_x86_mmx_psub_b>; +defm MMX_PSUBW : MMXI_binop_rm_int<0xF9, "psubw", int_x86_mmx_psub_w>; +defm MMX_PSUBD 
: MMXI_binop_rm_int<0xFA, "psubd", int_x86_mmx_psub_d>; +defm MMX_PSUBQ : MMXI_binop_rm_int<0xFB, "psubq", int_x86_mmx_psub_q>; + +defm MMX_PSUBSB : MMXI_binop_rm_int<0xE8, "psubsb" , int_x86_mmx_psubs_b>; +defm MMX_PSUBSW : MMXI_binop_rm_int<0xE9, "psubsw" , int_x86_mmx_psubs_w>; + +defm MMX_PSUBUSB : MMXI_binop_rm_int<0xD8, "psubusb", int_x86_mmx_psubus_b>; +defm MMX_PSUBUSW : MMXI_binop_rm_int<0xD9, "psubusw", int_x86_mmx_psubus_w>; + +defm MMX_PHSUBW : SS3I_binop_rm_int_mm<0x05, "phsubw", int_x86_ssse3_phsub_w>; +defm MMX_PHSUBD : SS3I_binop_rm_int_mm<0x06, "phsubd", int_x86_ssse3_phsub_d>; +defm MMX_PHSUBSW : SS3I_binop_rm_int_mm<0x07, "phsubsw",int_x86_ssse3_phsub_sw>; + +// -- Multiplication +defm MMX_PMULLW : MMXI_binop_rm_int<0xD5, "pmullw", int_x86_mmx_pmull_w, 1>; + +defm MMX_PMULHW : MMXI_binop_rm_int<0xE5, "pmulhw", int_x86_mmx_pmulh_w, 1>; +defm MMX_PMULHUW : MMXI_binop_rm_int<0xE4, "pmulhuw", int_x86_mmx_pmulhu_w, 1>; +defm MMX_PMULUDQ : MMXI_binop_rm_int<0xF4, "pmuludq", int_x86_mmx_pmulu_dq, 1>; +let isCommutable = 1 in +defm MMX_PMULHRSW : SS3I_binop_rm_int_mm<0x0B, "pmulhrsw", + int_x86_ssse3_pmul_hr_sw>; + +// -- Miscellanea +defm MMX_PMADDWD : MMXI_binop_rm_int<0xF5, "pmaddwd", int_x86_mmx_pmadd_wd, 1>; + +defm MMX_PMADDUBSW : SS3I_binop_rm_int_mm<0x04, "pmaddubsw", + int_x86_ssse3_pmadd_ub_sw>; +defm MMX_PAVGB : MMXI_binop_rm_int<0xE0, "pavgb", int_x86_mmx_pavg_b, 1>; +defm MMX_PAVGW : MMXI_binop_rm_int<0xE3, "pavgw", int_x86_mmx_pavg_w, 1>; + +defm MMX_PMINUB : MMXI_binop_rm_int<0xDA, "pminub", int_x86_mmx_pminu_b, 1>; +defm MMX_PMINSW : MMXI_binop_rm_int<0xEA, "pminsw", int_x86_mmx_pmins_w, 1>; + +defm MMX_PMAXUB : MMXI_binop_rm_int<0xDE, "pmaxub", int_x86_mmx_pmaxu_b, 1>; +defm MMX_PMAXSW : MMXI_binop_rm_int<0xEE, "pmaxsw", int_x86_mmx_pmaxs_w, 1>; + +defm MMX_PSADBW : MMXI_binop_rm_int<0xF6, "psadbw", int_x86_mmx_psad_bw, 1>; + +defm MMX_PSIGNB : SS3I_binop_rm_int_mm<0x08, "psignb", int_x86_ssse3_psign_b>; +defm MMX_PSIGNW : SS3I_binop_rm_int_mm<0x09, "psignw", int_x86_ssse3_psign_w>; +defm MMX_PSIGND : SS3I_binop_rm_int_mm<0x0A, "psignd", int_x86_ssse3_psign_d>; +let Constraints = "$src1 = $dst" in + defm MMX_PALIGN : ssse3_palign_mm<"palignr", int_x86_mmx_palignr_b>; + +// Logical Instructions +defm MMX_PAND : MMXI_binop_rm_int<0xDB, "pand", int_x86_mmx_pand, 1>; +defm MMX_POR : MMXI_binop_rm_int<0xEB, "por" , int_x86_mmx_por, 1>; +defm MMX_PXOR : MMXI_binop_rm_int<0xEF, "pxor", int_x86_mmx_pxor, 1>; +defm MMX_PANDN : MMXI_binop_rm_int<0xDF, "pandn", int_x86_mmx_pandn>; + +// Shift Instructions +defm MMX_PSRLW : MMXI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw", + int_x86_mmx_psrl_w, int_x86_mmx_psrli_w>; +defm MMX_PSRLD : MMXI_binop_rmi_int<0xD2, 0x72, MRM2r, "psrld", + int_x86_mmx_psrl_d, int_x86_mmx_psrli_d>; +defm MMX_PSRLQ : MMXI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq", + int_x86_mmx_psrl_q, int_x86_mmx_psrli_q>; + +defm MMX_PSLLW : MMXI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw", + int_x86_mmx_psll_w, int_x86_mmx_pslli_w>; +defm MMX_PSLLD : MMXI_binop_rmi_int<0xF2, 0x72, MRM6r, "pslld", + int_x86_mmx_psll_d, int_x86_mmx_pslli_d>; +defm MMX_PSLLQ : MMXI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq", + int_x86_mmx_psll_q, int_x86_mmx_pslli_q>; + +defm MMX_PSRAW : MMXI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw", + int_x86_mmx_psra_w, int_x86_mmx_psrai_w>; +defm MMX_PSRAD : MMXI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad", + int_x86_mmx_psra_d, int_x86_mmx_psrai_d>; + +// Comparison Instructions +defm MMX_PCMPEQB : MMXI_binop_rm_int<0x74, "pcmpeqb", int_x86_mmx_pcmpeq_b>; 
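+// Each MMXI_binop_rm_int defm expands to a register-register and a
+// register-memory form; MMX_PCMPEQB above, for example, yields the records
+// MMX_PCMPEQBirr and MMX_PCMPEQBirm.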
+defm MMX_PCMPEQW : MMXI_binop_rm_int<0x75, "pcmpeqw", int_x86_mmx_pcmpeq_w>; +defm MMX_PCMPEQD : MMXI_binop_rm_int<0x76, "pcmpeqd", int_x86_mmx_pcmpeq_d>; + +defm MMX_PCMPGTB : MMXI_binop_rm_int<0x64, "pcmpgtb", int_x86_mmx_pcmpgt_b>; +defm MMX_PCMPGTW : MMXI_binop_rm_int<0x65, "pcmpgtw", int_x86_mmx_pcmpgt_w>; +defm MMX_PCMPGTD : MMXI_binop_rm_int<0x66, "pcmpgtd", int_x86_mmx_pcmpgt_d>; + +// -- Unpack Instructions +defm MMX_PUNPCKHBW : MMXI_binop_rm_int<0x68, "punpckhbw", + int_x86_mmx_punpckhbw>; +defm MMX_PUNPCKHWD : MMXI_binop_rm_int<0x69, "punpckhwd", + int_x86_mmx_punpckhwd>; +defm MMX_PUNPCKHDQ : MMXI_binop_rm_int<0x6A, "punpckhdq", + int_x86_mmx_punpckhdq>; +defm MMX_PUNPCKLBW : MMXI_binop_rm_int<0x60, "punpcklbw", + int_x86_mmx_punpcklbw>; +defm MMX_PUNPCKLWD : MMXI_binop_rm_int<0x61, "punpcklwd", + int_x86_mmx_punpcklwd>; +defm MMX_PUNPCKLDQ : MMXI_binop_rm_int<0x62, "punpckldq", + int_x86_mmx_punpckldq>; + +// -- Pack Instructions +defm MMX_PACKSSWB : MMXI_binop_rm_int<0x63, "packsswb", int_x86_mmx_packsswb>; +defm MMX_PACKSSDW : MMXI_binop_rm_int<0x6B, "packssdw", int_x86_mmx_packssdw>; +defm MMX_PACKUSWB : MMXI_binop_rm_int<0x67, "packuswb", int_x86_mmx_packuswb>; + +// -- Shuffle Instructions +defm MMX_PSHUFB : SS3I_binop_rm_int_mm<0x00, "pshufb", int_x86_ssse3_pshuf_b>; + +def MMX_PSHUFWri : MMXIi8<0x70, MRMSrcReg, + (outs VR64:$dst), (ins VR64:$src1, i8imm:$src2), + "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR64:$dst, + (int_x86_sse_pshuf_w VR64:$src1, imm:$src2))]>; +def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem, + (outs VR64:$dst), (ins i64mem:$src1, i8imm:$src2), + "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR64:$dst, + (int_x86_sse_pshuf_w (load_mmx addr:$src1), + imm:$src2))]>; + + + + + +// -- Conversion Instructions +defm MMX_CVTPS2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtps2pi, + f64mem, load, "cvtps2pi\t{$src, $dst|$dst, $src}", + SSEPackedSingle>, TB; +defm MMX_CVTPD2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtpd2pi, + f128mem, memop, "cvtpd2pi\t{$src, $dst|$dst, $src}", + SSEPackedDouble>, TB, OpSize; +defm MMX_CVTTPS2PI : sse12_cvt_pint<0x2C, VR128, VR64, int_x86_sse_cvttps2pi, + f64mem, load, "cvttps2pi\t{$src, $dst|$dst, $src}", + SSEPackedSingle>, TB; +defm MMX_CVTTPD2PI : sse12_cvt_pint<0x2C, VR128, VR64, int_x86_sse_cvttpd2pi, + f128mem, memop, "cvttpd2pi\t{$src, $dst|$dst, $src}", + SSEPackedDouble>, TB, OpSize; +defm MMX_CVTPI2PD : sse12_cvt_pint<0x2A, VR64, VR128, int_x86_sse_cvtpi2pd, + i64mem, load, "cvtpi2pd\t{$src, $dst|$dst, $src}", + SSEPackedDouble>, TB, OpSize; +let Constraints = "$src1 = $dst" in { + defm MMX_CVTPI2PS : sse12_cvt_pint_3addr<0x2A, VR64, VR128, + int_x86_sse_cvtpi2ps, + i64mem, load, "cvtpi2ps\t{$src2, $dst|$dst, $src2}", + SSEPackedSingle>, TB; +} + +// Extract / Insert +def MMX_PEXTRWirri: MMXIi8<0xC5, MRMSrcReg, + (outs GR32:$dst), (ins VR64:$src1, i32i8imm:$src2), + "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32:$dst, (int_x86_mmx_pextr_w VR64:$src1, + (iPTR imm:$src2)))]>; +let Constraints = "$src1 = $dst" in { + def MMX_PINSRWirri : MMXIi8<0xC4, MRMSrcReg, + (outs VR64:$dst), + (ins VR64:$src1, GR32:$src2, i32i8imm:$src3), + "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1, + GR32:$src2, (iPTR imm:$src3)))]>; + + def MMX_PINSRWirmi : MMXIi8<0xC4, MRMSrcMem, + (outs VR64:$dst), + (ins VR64:$src1, i16mem:$src2, i32i8imm:$src3), + "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set VR64:$dst, 
(int_x86_mmx_pinsr_w VR64:$src1, + (i32 (anyext (loadi16 addr:$src2))), + (iPTR imm:$src3)))]>; +} + +// Mask creation +def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR64:$src), + "pmovmskb\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, + (int_x86_mmx_pmovmskb VR64:$src))]>; + + +// MMX to XMM for vector types +def MMX_X86movq2dq : SDNode<"X86ISD::MOVQ2DQ", SDTypeProfile<1, 1, + [SDTCisVT<0, v2i64>, SDTCisVT<1, x86mmx>]>>; + +def : Pat<(v2i64 (MMX_X86movq2dq VR64:$src)), + (v2i64 (MMX_MOVQ2DQrr VR64:$src))>; + +def : Pat<(v2i64 (MMX_X86movq2dq (load_mmx addr:$src))), + (v2i64 (MOVQI2PQIrm addr:$src))>; + +def : Pat<(v2i64 (MMX_X86movq2dq + (x86mmx (scalar_to_vector (loadi32 addr:$src))))), + (v2i64 (MOVDI2PDIrm addr:$src))>; + +// Low word of XMM to MMX. +def MMX_X86movdq2q : SDNode<"X86ISD::MOVDQ2Q", SDTypeProfile<1, 1, + [SDTCisVT<0, x86mmx>, SDTCisVT<1, v2i64>]>>; + +def : Pat<(x86mmx (MMX_X86movdq2q VR128:$src)), + (x86mmx (MMX_MOVDQ2Qrr VR128:$src))>; + +def : Pat<(x86mmx (MMX_X86movdq2q (loadv2i64 addr:$src))), + (x86mmx (MMX_MOVQ64rm addr:$src))>; + +// Misc. +let Uses = [EDI] in +def MMX_MASKMOVQ : MMXI<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask), + "maskmovq\t{$mask, $src|$src, $mask}", + [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, EDI)]>; +let Uses = [RDI] in +def MMX_MASKMOVQ64: MMXI64<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask), + "maskmovq\t{$mask, $src|$src, $mask}", + [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, RDI)]>; + +// 64-bit bit convert. +def : Pat<(x86mmx (bitconvert (i64 GR64:$src))), + (MMX_MOVD64to64rr GR64:$src)>; +def : Pat<(i64 (bitconvert (x86mmx VR64:$src))), + (MMX_MOVD64from64rr VR64:$src)>; +def : Pat<(f64 (bitconvert (x86mmx VR64:$src))), + (MMX_MOVQ2FR64rr VR64:$src)>; +def : Pat<(x86mmx (bitconvert (f64 FR64:$src))), + (MMX_MOVFR642Qrr FR64:$src)>; + + diff --git a/contrib/llvm/lib/Target/X86/X86InstrSSE.td b/contrib/llvm/lib/Target/X86/X86InstrSSE.td new file mode 100644 index 0000000..d3ced23 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrSSE.td @@ -0,0 +1,6799 @@ +//====- X86InstrSSE.td - Describe the X86 Instruction Set --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the X86 SSE instruction set, defining the instructions, +// and properties of the instructions which are needed for code generation, +// machine code emission, and analysis. 
+// +//===----------------------------------------------------------------------===// + + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 Instructions Classes +//===----------------------------------------------------------------------===// + +/// sse12_fp_scalar - SSE 1 & 2 scalar instructions class +multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode, + RegisterClass RC, X86MemOperand x86memop, + bit Is2Addr = 1> { + let isCommutable = 1 in { + def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (OpNode RC:$src1, RC:$src2))]>; + } + def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))]>; +} + +/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class +multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr, RegisterClass RC, + string asm, string SSEVer, string FPSizeStr, + Operand memopr, ComplexPattern mem_cpat, + bit Is2Addr = 1> { + def rr_Int : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (!cast<Intrinsic>( + !strconcat("int_x86_sse", SSEVer, "_", OpcodeStr, FPSizeStr)) + RC:$src1, RC:$src2))]>; + def rm_Int : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2), + !if(Is2Addr, + !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (!cast<Intrinsic>(!strconcat("int_x86_sse", + SSEVer, "_", OpcodeStr, FPSizeStr)) + RC:$src1, mem_cpat:$src2))]>; +} + +/// sse12_fp_packed - SSE 1 & 2 packed instructions class +multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, + RegisterClass RC, ValueType vt, + X86MemOperand x86memop, PatFrag mem_frag, + Domain d, bit Is2Addr = 1> { + let isCommutable = 1 in + def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], d>; + let mayLoad = 1 in + def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))], d>; +} + +/// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class +multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d, + string OpcodeStr, X86MemOperand x86memop, + list<dag> pat_rr, list<dag> pat_rm, + bit Is2Addr = 1> { + let isCommutable = 1 in + def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + pat_rr, d>; + def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, 
$src1, $dst|$dst, $src1, $src2}")), + pat_rm, d>; +} + +/// sse12_fp_packed_int - SSE 1 & 2 packed instructions intrinsics class +multiclass sse12_fp_packed_int<bits<8> opc, string OpcodeStr, RegisterClass RC, + string asm, string SSEVer, string FPSizeStr, + X86MemOperand x86memop, PatFrag mem_frag, + Domain d, bit Is2Addr = 1> { + def rr_Int : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (!cast<Intrinsic>( + !strconcat("int_x86_", SSEVer, "_", OpcodeStr, FPSizeStr)) + RC:$src1, RC:$src2))], d>; + def rm_Int : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1,x86memop:$src2), + !if(Is2Addr, + !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (!cast<Intrinsic>( + !strconcat("int_x86_", SSEVer, "_", OpcodeStr, FPSizeStr)) + RC:$src1, (mem_frag addr:$src2)))], d>; +} + +//===----------------------------------------------------------------------===// +// Non-instruction patterns +//===----------------------------------------------------------------------===// + +// A vector extract of the first f32/f64 position is a subregister copy +def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), + (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>; +def : Pat<(f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))), + (f64 (EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd))>; + +// A 128-bit subvector extract from the first 256-bit vector position +// is a subregister copy that needs no instruction. +def : Pat<(v4i32 (extract_subvector (v8i32 VR256:$src), (i32 0))), + (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm))>; +def : Pat<(v4f32 (extract_subvector (v8f32 VR256:$src), (i32 0))), + (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm))>; + +def : Pat<(v2i64 (extract_subvector (v4i64 VR256:$src), (i32 0))), + (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm))>; +def : Pat<(v2f64 (extract_subvector (v4f64 VR256:$src), (i32 0))), + (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm))>; + +def : Pat<(v8i16 (extract_subvector (v16i16 VR256:$src), (i32 0))), + (v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src), sub_xmm))>; +def : Pat<(v16i8 (extract_subvector (v32i8 VR256:$src), (i32 0))), + (v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src), sub_xmm))>; + +// A 128-bit subvector insert to the first 256-bit vector position +// is a subregister copy that needs no instruction. +def : Pat<(insert_subvector undef, (v2i64 VR128:$src), (i32 0)), + (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; +def : Pat<(insert_subvector undef, (v2f64 VR128:$src), (i32 0)), + (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; +def : Pat<(insert_subvector undef, (v4i32 VR128:$src), (i32 0)), + (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; +def : Pat<(insert_subvector undef, (v4f32 VR128:$src), (i32 0)), + (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; +def : Pat<(insert_subvector undef, (v8i16 VR128:$src), (i32 0)), + (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; +def : Pat<(insert_subvector undef, (v16i8 VR128:$src), (i32 0)), + (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; + +// Implicitly promote a 32-bit scalar to a vector. 
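+// (Both this and the 64-bit promotion below are modeled as an INSERT_SUBREG
+// into an IMPLICIT_DEF vector, which is normally coalesced into a plain
+// subregister copy rather than a real instruction.)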
+def : Pat<(v4f32 (scalar_to_vector FR32:$src)), + (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, sub_ss)>; +def : Pat<(v8f32 (scalar_to_vector FR32:$src)), + (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), FR32:$src, sub_ss)>; +// Implicitly promote a 64-bit scalar to a vector. +def : Pat<(v2f64 (scalar_to_vector FR64:$src)), + (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src, sub_sd)>; +def : Pat<(v4f64 (scalar_to_vector FR64:$src)), + (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), FR64:$src, sub_sd)>; + +// Bitcasts between 128-bit vector types. Return the original type since +// no instruction is needed for the conversion +let Predicates = [HasXMMInt] in { + def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>; + def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>; + def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>; + def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>; + def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>; + def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>; + def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>; + def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>; + def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>; + def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>; + def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>; + def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>; + def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>; + def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>; + def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>; + def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>; + def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>; + def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>; + def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>; + def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>; + def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>; + def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>; + def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>; + def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>; + def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>; + def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>; + def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>; + def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>; + def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>; + def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>; +} + +// Bitcasts between 256-bit vector types. 
Return the original type since +// no instruction is needed for the conversion +let Predicates = [HasAVX] in { + def : Pat<(v4f64 (bitconvert (v8f32 VR256:$src))), (v4f64 VR256:$src)>; + def : Pat<(v4f64 (bitconvert (v8i32 VR256:$src))), (v4f64 VR256:$src)>; + def : Pat<(v4f64 (bitconvert (v4i64 VR256:$src))), (v4f64 VR256:$src)>; + def : Pat<(v4f64 (bitconvert (v16i16 VR256:$src))), (v4f64 VR256:$src)>; + def : Pat<(v4f64 (bitconvert (v32i8 VR256:$src))), (v4f64 VR256:$src)>; + def : Pat<(v8f32 (bitconvert (v8i32 VR256:$src))), (v8f32 VR256:$src)>; + def : Pat<(v8f32 (bitconvert (v4i64 VR256:$src))), (v8f32 VR256:$src)>; + def : Pat<(v8f32 (bitconvert (v4f64 VR256:$src))), (v8f32 VR256:$src)>; + def : Pat<(v8f32 (bitconvert (v32i8 VR256:$src))), (v8f32 VR256:$src)>; + def : Pat<(v8f32 (bitconvert (v16i16 VR256:$src))), (v8f32 VR256:$src)>; + def : Pat<(v4i64 (bitconvert (v8f32 VR256:$src))), (v4i64 VR256:$src)>; + def : Pat<(v4i64 (bitconvert (v8i32 VR256:$src))), (v4i64 VR256:$src)>; + def : Pat<(v4i64 (bitconvert (v4f64 VR256:$src))), (v4i64 VR256:$src)>; + def : Pat<(v4i64 (bitconvert (v32i8 VR256:$src))), (v4i64 VR256:$src)>; + def : Pat<(v4i64 (bitconvert (v16i16 VR256:$src))), (v4i64 VR256:$src)>; + def : Pat<(v32i8 (bitconvert (v4f64 VR256:$src))), (v32i8 VR256:$src)>; + def : Pat<(v32i8 (bitconvert (v4i64 VR256:$src))), (v32i8 VR256:$src)>; + def : Pat<(v32i8 (bitconvert (v8f32 VR256:$src))), (v32i8 VR256:$src)>; + def : Pat<(v32i8 (bitconvert (v8i32 VR256:$src))), (v32i8 VR256:$src)>; + def : Pat<(v32i8 (bitconvert (v16i16 VR256:$src))), (v32i8 VR256:$src)>; + def : Pat<(v8i32 (bitconvert (v32i8 VR256:$src))), (v8i32 VR256:$src)>; + def : Pat<(v8i32 (bitconvert (v16i16 VR256:$src))), (v8i32 VR256:$src)>; + def : Pat<(v8i32 (bitconvert (v8f32 VR256:$src))), (v8i32 VR256:$src)>; + def : Pat<(v8i32 (bitconvert (v4i64 VR256:$src))), (v8i32 VR256:$src)>; + def : Pat<(v8i32 (bitconvert (v4f64 VR256:$src))), (v8i32 VR256:$src)>; + def : Pat<(v16i16 (bitconvert (v8f32 VR256:$src))), (v16i16 VR256:$src)>; + def : Pat<(v16i16 (bitconvert (v8i32 VR256:$src))), (v16i16 VR256:$src)>; + def : Pat<(v16i16 (bitconvert (v4i64 VR256:$src))), (v16i16 VR256:$src)>; + def : Pat<(v16i16 (bitconvert (v4f64 VR256:$src))), (v16i16 VR256:$src)>; + def : Pat<(v16i16 (bitconvert (v32i8 VR256:$src))), (v16i16 VR256:$src)>; +} + +// Alias instructions that map fld0 to pxor for sse. +// FIXME: Set encoding to pseudo! +let isReMaterializable = 1, isAsCheapAsAMove = 1, isCodeGenOnly = 1, + canFoldAsLoad = 1 in { + def FsFLD0SS : I<0xEF, MRMInitReg, (outs FR32:$dst), (ins), "", + [(set FR32:$dst, fp32imm0)]>, + Requires<[HasSSE1]>, TB, OpSize; + def FsFLD0SD : I<0xEF, MRMInitReg, (outs FR64:$dst), (ins), "", + [(set FR64:$dst, fpimm0)]>, + Requires<[HasSSE2]>, TB, OpSize; + def VFsFLD0SS : I<0xEF, MRMInitReg, (outs FR32:$dst), (ins), "", + [(set FR32:$dst, fp32imm0)]>, + Requires<[HasAVX]>, TB, OpSize, VEX_4V; + def VFsFLD0SD : I<0xEF, MRMInitReg, (outs FR64:$dst), (ins), "", + [(set FR64:$dst, fpimm0)]>, + Requires<[HasAVX]>, TB, OpSize, VEX_4V; +} + +//===----------------------------------------------------------------------===// +// AVX & SSE - Zero/One Vectors +//===----------------------------------------------------------------------===// + +// Alias instruction that maps zero vector to pxor / xorp* for sse. +// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then +// swizzled by ExecutionDepsFix to pxor. 
+// We set canFoldAsLoad because this can be converted to a constant-pool +// load of an all-zeros value if folding it would be beneficial. +let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, + isPseudo = 1, neverHasSideEffects = 1 in { +def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "", []>; +} + +def : Pat<(v4f32 immAllZerosV), (V_SET0)>; +def : Pat<(v2f64 immAllZerosV), (V_SET0)>; +def : Pat<(v4i32 immAllZerosV), (V_SET0)>; +def : Pat<(v2i64 immAllZerosV), (V_SET0)>; +def : Pat<(v8i16 immAllZerosV), (V_SET0)>; +def : Pat<(v16i8 immAllZerosV), (V_SET0)>; + + +// The same as done above but for AVX. The 256-bit ISA does not support PI, +// and doesn't need it because on sandy bridge the register is set to zero +// at the rename stage without using any execution unit, so SET0PSY +// and SET0PDY can be used for vector int instructions without penalty +// FIXME: Change encoding to pseudo! This is blocked right now by the x86 +// JIT implementatioan, it does not expand the instructions below like +// X86MCInstLower does. +let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, + isCodeGenOnly = 1, Predicates = [HasAVX] in { +def AVX_SET0PSY : PSI<0x57, MRMInitReg, (outs VR256:$dst), (ins), "", + [(set VR256:$dst, (v8f32 immAllZerosV))]>, VEX_4V; +def AVX_SET0PDY : PDI<0x57, MRMInitReg, (outs VR256:$dst), (ins), "", + [(set VR256:$dst, (v4f64 immAllZerosV))]>, VEX_4V; +} + + +// AVX has no support for 256-bit integer instructions, but since the 128-bit +// VPXOR instruction writes zero to its upper part, it's safe build zeros. +def : Pat<(v8i32 immAllZerosV), (SUBREG_TO_REG (i32 0), (V_SET0), sub_xmm)>; +def : Pat<(bc_v8i32 (v8f32 immAllZerosV)), + (SUBREG_TO_REG (i32 0), (V_SET0), sub_xmm)>; + +def : Pat<(v4i64 immAllZerosV), (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>; +def : Pat<(bc_v4i64 (v8f32 immAllZerosV)), + (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>; + +// We set canFoldAsLoad because this can be converted to a constant-pool +// load of an all-ones value if folding it would be beneficial. +// FIXME: Change encoding to pseudo! This is blocked right now by the x86 +// JIT implementation, it does not expand the instructions below like +// X86MCInstLower does. +let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, + isCodeGenOnly = 1, ExeDomain = SSEPackedInt in + def V_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "", + [(set VR128:$dst, (v4i32 immAllOnesV))]>; +let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, + isCodeGenOnly = 1, ExeDomain = SSEPackedInt, Predicates = [HasAVX] in + def AVX_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "", + [(set VR128:$dst, (v4i32 immAllOnesV))]>, VEX_4V; + + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Move FP Scalar Instructions +// +// Move Instructions. Register-to-register movss/movsd is not used for FR32/64 +// register copies because it's a partial register update; FsMOVAPSrr/FsMOVAPDrr +// is used instead. Register-to-register movss/movsd is not modeled as an +// INSERT_SUBREG because INSERT_SUBREG requires that the insert be implementable +// in terms of a copy, and just mentioned, we don't use movss/movsd for copies. 
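+// (The register form of movss/movsd writes only the low 32/64 bits of the
+// destination XMM register and leaves the upper bits unchanged, hence the
+// partial register update.)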
+//===----------------------------------------------------------------------===// + +class sse12_move_rr<RegisterClass RC, ValueType vt, string asm> : + SI<0x10, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, RC:$src2), asm, + [(set (vt VR128:$dst), (movl VR128:$src1, (scalar_to_vector RC:$src2)))]>; + +// Loading from memory automatically zeroing upper bits. +class sse12_move_rm<RegisterClass RC, X86MemOperand x86memop, + PatFrag mem_pat, string OpcodeStr> : + SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (mem_pat addr:$src))]>; + +// AVX +def VMOVSSrr : sse12_move_rr<FR32, v4f32, + "movss\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XS, VEX_4V, + VEX_LIG; +def VMOVSDrr : sse12_move_rr<FR64, v2f64, + "movsd\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XD, VEX_4V, + VEX_LIG; + +// For the disassembler +let isCodeGenOnly = 1 in { + def VMOVSSrr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst), + (ins VR128:$src1, FR32:$src2), + "movss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + XS, VEX_4V, VEX_LIG; + def VMOVSDrr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst), + (ins VR128:$src1, FR64:$src2), + "movsd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + XD, VEX_4V, VEX_LIG; +} + +let canFoldAsLoad = 1, isReMaterializable = 1 in { + def VMOVSSrm : sse12_move_rm<FR32, f32mem, loadf32, "movss">, XS, VEX, + VEX_LIG; + let AddedComplexity = 20 in + def VMOVSDrm : sse12_move_rm<FR64, f64mem, loadf64, "movsd">, XD, VEX, + VEX_LIG; +} + +def VMOVSSmr : SI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, FR32:$src), + "movss\t{$src, $dst|$dst, $src}", + [(store FR32:$src, addr:$dst)]>, XS, VEX, VEX_LIG; +def VMOVSDmr : SI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src), + "movsd\t{$src, $dst|$dst, $src}", + [(store FR64:$src, addr:$dst)]>, XD, VEX, VEX_LIG; + +// SSE1 & 2 +let Constraints = "$src1 = $dst" in { + def MOVSSrr : sse12_move_rr<FR32, v4f32, + "movss\t{$src2, $dst|$dst, $src2}">, XS; + def MOVSDrr : sse12_move_rr<FR64, v2f64, + "movsd\t{$src2, $dst|$dst, $src2}">, XD; + + // For the disassembler + let isCodeGenOnly = 1 in { + def MOVSSrr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst), + (ins VR128:$src1, FR32:$src2), + "movss\t{$src2, $dst|$dst, $src2}", []>, XS; + def MOVSDrr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst), + (ins VR128:$src1, FR64:$src2), + "movsd\t{$src2, $dst|$dst, $src2}", []>, XD; + } +} + +let canFoldAsLoad = 1, isReMaterializable = 1 in { + def MOVSSrm : sse12_move_rm<FR32, f32mem, loadf32, "movss">, XS; + + let AddedComplexity = 20 in + def MOVSDrm : sse12_move_rm<FR64, f64mem, loadf64, "movsd">, XD; +} + +def MOVSSmr : SSI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, FR32:$src), + "movss\t{$src, $dst|$dst, $src}", + [(store FR32:$src, addr:$dst)]>; +def MOVSDmr : SDI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src), + "movsd\t{$src, $dst|$dst, $src}", + [(store FR64:$src, addr:$dst)]>; + +// Patterns +let Predicates = [HasSSE1] in { + let AddedComplexity = 15 in { + // Extract the low 32-bit value from one vector and insert it into another. + def : Pat<(v4f32 (movl VR128:$src1, VR128:$src2)), + (MOVSSrr (v4f32 VR128:$src1), + (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>; + def : Pat<(v4i32 (movl VR128:$src1, VR128:$src2)), + (MOVSSrr (v4i32 VR128:$src1), + (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>; + + // Move scalar to XMM zero-extended, zeroing a VR128 then do a + // MOVSS to the lower bits. 
+ def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))), + (MOVSSrr (v4f32 (V_SET0)), FR32:$src)>; + def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), + (MOVSSrr (v4f32 (V_SET0)), + (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss)))>; + def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), + (MOVSSrr (v4i32 (V_SET0)), + (EXTRACT_SUBREG (v4i32 VR128:$src), sub_ss))>; + } + + let AddedComplexity = 20 in { + // MOVSSrm zeros the high parts of the register; represent this + // with SUBREG_TO_REG. + def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))), + (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>; + def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))), + (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>; + def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))), + (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>; + } + + // Extract and store. + def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), + addr:$dst), + (MOVSSmr addr:$dst, + (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>; + + // Shuffle with MOVSS + def : Pat<(v4f32 (X86Movss VR128:$src1, (scalar_to_vector FR32:$src2))), + (MOVSSrr VR128:$src1, FR32:$src2)>; + def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)), + (MOVSSrr (v4i32 VR128:$src1), + (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>; + def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)), + (MOVSSrr (v4f32 VR128:$src1), + (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>; +} + +let Predicates = [HasSSE2] in { + let AddedComplexity = 15 in { + // Extract the low 64-bit value from one vector and insert it into another. + def : Pat<(v2f64 (movl VR128:$src1, VR128:$src2)), + (MOVSDrr (v2f64 VR128:$src1), + (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>; + def : Pat<(v2i64 (movl VR128:$src1, VR128:$src2)), + (MOVSDrr (v2i64 VR128:$src1), + (EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>; + + // vector_shuffle v1, v2 <4, 5, 2, 3> using movsd + def : Pat<(v4f32 (movlp VR128:$src1, VR128:$src2)), + (MOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, sub_sd))>; + def : Pat<(v4i32 (movlp VR128:$src1, VR128:$src2)), + (MOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, sub_sd))>; + + // Move scalar to XMM zero-extended, zeroing a VR128 then do a + // MOVSD to the lower bits. + def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))), + (MOVSDrr (v2f64 (V_SET0)), FR64:$src)>; + } + + let AddedComplexity = 20 in { + // MOVSDrm zeros the high parts of the register; represent this + // with SUBREG_TO_REG. + def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))), + (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>; + def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))), + (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>; + def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))), + (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>; + def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))), + (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>; + def : Pat<(v2f64 (X86vzload addr:$src)), + (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>; + } + + // Extract and store. 
+ def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))), + addr:$dst), + (MOVSDmr addr:$dst, + (EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd))>; + + // Shuffle with MOVSD + def : Pat<(v2f64 (X86Movsd VR128:$src1, (scalar_to_vector FR64:$src2))), + (MOVSDrr VR128:$src1, FR64:$src2)>; + def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)), + (MOVSDrr (v2i64 VR128:$src1), + (EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>; + def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)), + (MOVSDrr (v2f64 VR128:$src1), + (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>; + def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)), + (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2),sub_sd))>; + def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)), + (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2),sub_sd))>; + + // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem + // is during lowering, where it's not possible to recognize the fold cause + // it has two uses through a bitcast. One use disappears at isel time and the + // fold opportunity reappears. + def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)), + (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2),sub_sd))>; + def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)), + (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2),sub_sd))>; +} + +let Predicates = [HasAVX] in { + let AddedComplexity = 15 in { + // Extract the low 32-bit value from one vector and insert it into another. + def : Pat<(v4f32 (movl VR128:$src1, VR128:$src2)), + (VMOVSSrr (v4f32 VR128:$src1), + (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>; + def : Pat<(v4i32 (movl VR128:$src1, VR128:$src2)), + (VMOVSSrr (v4i32 VR128:$src1), + (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>; + + // Extract the low 64-bit value from one vector and insert it into another. + def : Pat<(v2f64 (movl VR128:$src1, VR128:$src2)), + (VMOVSDrr (v2f64 VR128:$src1), + (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>; + def : Pat<(v2i64 (movl VR128:$src1, VR128:$src2)), + (VMOVSDrr (v2i64 VR128:$src1), + (EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>; + + // vector_shuffle v1, v2 <4, 5, 2, 3> using movsd + def : Pat<(v4f32 (movlp VR128:$src1, VR128:$src2)), + (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, sub_sd))>; + def : Pat<(v4i32 (movlp VR128:$src1, VR128:$src2)), + (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, sub_sd))>; + + // Move scalar to XMM zero-extended, zeroing a VR128 then do a + // MOVS{S,D} to the lower bits. + def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))), + (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>; + def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), + (VMOVSSrr (v4f32 (V_SET0)), + (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss)))>; + def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), + (VMOVSSrr (v4i32 (V_SET0)), + (EXTRACT_SUBREG (v4i32 VR128:$src), sub_ss))>; + def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))), + (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>; + } + + let AddedComplexity = 20 in { + // MOVSSrm zeros the high parts of the register; represent this + // with SUBREG_TO_REG. 
The AVX versions also write: DST[255:128] <- 0 + def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))), + (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_ss)>; + def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))), + (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_ss)>; + def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))), + (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_ss)>; + + // MOVSDrm zeros the high parts of the register; represent this + // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0 + def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))), + (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>; + def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))), + (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>; + def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))), + (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>; + def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))), + (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>; + def : Pat<(v2f64 (X86vzload addr:$src)), + (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>; + + // Represent the same patterns above but in the form they appear for + // 256-bit types + def : Pat<(v8f32 (X86vzmovl (insert_subvector undef, + (v4f32 (scalar_to_vector (loadf32 addr:$src))), (i32 0)))), + (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_ss)>; + def : Pat<(v4f64 (X86vzmovl (insert_subvector undef, + (v2f64 (scalar_to_vector (loadf64 addr:$src))), (i32 0)))), + (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_sd)>; + } + def : Pat<(v8f32 (X86vzmovl (insert_subvector undef, + (v4f32 (scalar_to_vector FR32:$src)), (i32 0)))), + (SUBREG_TO_REG (i32 0), + (v4f32 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)), + sub_xmm)>; + def : Pat<(v4f64 (X86vzmovl (insert_subvector undef, + (v2f64 (scalar_to_vector FR64:$src)), (i32 0)))), + (SUBREG_TO_REG (i64 0), + (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)), + sub_xmm)>; + + // Extract and store. 
+ def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), + addr:$dst), + (VMOVSSmr addr:$dst, + (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>; + def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))), + addr:$dst), + (VMOVSDmr addr:$dst, + (EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd))>; + + // Shuffle with VMOVSS + def : Pat<(v4f32 (X86Movss VR128:$src1, (scalar_to_vector FR32:$src2))), + (VMOVSSrr VR128:$src1, FR32:$src2)>; + def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)), + (VMOVSSrr (v4i32 VR128:$src1), + (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>; + def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)), + (VMOVSSrr (v4f32 VR128:$src1), + (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>; + + // Shuffle with VMOVSD + def : Pat<(v2f64 (X86Movsd VR128:$src1, (scalar_to_vector FR64:$src2))), + (VMOVSDrr VR128:$src1, FR64:$src2)>; + def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)), + (VMOVSDrr (v2i64 VR128:$src1), + (EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>; + def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)), + (VMOVSDrr (v2f64 VR128:$src1), + (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>; + def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)), + (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2), + sub_sd))>; + def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)), + (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2), + sub_sd))>; + + // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem + // is during lowering, where it's not possible to recognize the fold cause + // it has two uses through a bitcast. One use disappears at isel time and the + // fold opportunity reappears. + def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)), + (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2), + sub_sd))>; + def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)), + (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2), + sub_sd))>; +} + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions +//===----------------------------------------------------------------------===// + +multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC, + X86MemOperand x86memop, PatFrag ld_frag, + string asm, Domain d, + bit IsReMaterializable = 1> { +let neverHasSideEffects = 1 in + def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], d>; +let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in + def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (ld_frag addr:$src))], d>; +} + +defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, + "movaps", SSEPackedSingle>, TB, VEX; +defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, + "movapd", SSEPackedDouble>, TB, OpSize, VEX; +defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, + "movups", SSEPackedSingle>, TB, VEX; +defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, + "movupd", SSEPackedDouble, 0>, TB, OpSize, VEX; + +defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32, + "movaps", SSEPackedSingle>, TB, VEX; +defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64, + "movapd", SSEPackedDouble>, TB, OpSize, VEX; +defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, + "movups", SSEPackedSingle>, TB, VEX; +defm VMOVUPDY : 
sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, + "movupd", SSEPackedDouble, 0>, TB, OpSize, VEX; +defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, + "movaps", SSEPackedSingle>, TB; +defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, + "movapd", SSEPackedDouble>, TB, OpSize; +defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, + "movups", SSEPackedSingle>, TB; +defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, + "movupd", SSEPackedDouble, 0>, TB, OpSize; + +def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movaps\t{$src, $dst|$dst, $src}", + [(alignedstore (v4f32 VR128:$src), addr:$dst)]>, VEX; +def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movapd\t{$src, $dst|$dst, $src}", + [(alignedstore (v2f64 VR128:$src), addr:$dst)]>, VEX; +def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movups\t{$src, $dst|$dst, $src}", + [(store (v4f32 VR128:$src), addr:$dst)]>, VEX; +def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movupd\t{$src, $dst|$dst, $src}", + [(store (v2f64 VR128:$src), addr:$dst)]>, VEX; +def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), + "movaps\t{$src, $dst|$dst, $src}", + [(alignedstore256 (v8f32 VR256:$src), addr:$dst)]>, VEX; +def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), + "movapd\t{$src, $dst|$dst, $src}", + [(alignedstore256 (v4f64 VR256:$src), addr:$dst)]>, VEX; +def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), + "movups\t{$src, $dst|$dst, $src}", + [(store (v8f32 VR256:$src), addr:$dst)]>, VEX; +def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), + "movupd\t{$src, $dst|$dst, $src}", + [(store (v4f64 VR256:$src), addr:$dst)]>, VEX; + +// For disassembler +let isCodeGenOnly = 1 in { + def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst), + (ins VR128:$src), + "movaps\t{$src, $dst|$dst, $src}", []>, VEX; + def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst), + (ins VR128:$src), + "movapd\t{$src, $dst|$dst, $src}", []>, VEX; + def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst), + (ins VR128:$src), + "movups\t{$src, $dst|$dst, $src}", []>, VEX; + def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst), + (ins VR128:$src), + "movupd\t{$src, $dst|$dst, $src}", []>, VEX; + def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst), + (ins VR256:$src), + "movaps\t{$src, $dst|$dst, $src}", []>, VEX; + def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst), + (ins VR256:$src), + "movapd\t{$src, $dst|$dst, $src}", []>, VEX; + def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst), + (ins VR256:$src), + "movups\t{$src, $dst|$dst, $src}", []>, VEX; + def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst), + (ins VR256:$src), + "movupd\t{$src, $dst|$dst, $src}", []>, VEX; +} + +def : Pat<(int_x86_avx_loadu_ps_256 addr:$src), (VMOVUPSYrm addr:$src)>; +def : Pat<(int_x86_avx_storeu_ps_256 addr:$dst, VR256:$src), + (VMOVUPSYmr addr:$dst, VR256:$src)>; + +def : Pat<(int_x86_avx_loadu_pd_256 addr:$src), (VMOVUPDYrm addr:$src)>; +def : Pat<(int_x86_avx_storeu_pd_256 addr:$dst, VR256:$src), + (VMOVUPDYmr addr:$dst, VR256:$src)>; + +def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movaps\t{$src, $dst|$dst, $src}", + [(alignedstore (v4f32 VR128:$src), addr:$dst)]>; +def MOVAPDmr : PDI<0x29, 
MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movapd\t{$src, $dst|$dst, $src}", + [(alignedstore (v2f64 VR128:$src), addr:$dst)]>; +def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movups\t{$src, $dst|$dst, $src}", + [(store (v4f32 VR128:$src), addr:$dst)]>; +def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movupd\t{$src, $dst|$dst, $src}", + [(store (v2f64 VR128:$src), addr:$dst)]>; + +// For disassembler +let isCodeGenOnly = 1 in { + def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), + "movaps\t{$src, $dst|$dst, $src}", []>; + def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), + "movapd\t{$src, $dst|$dst, $src}", []>; + def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), + "movups\t{$src, $dst|$dst, $src}", []>; + def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), + "movupd\t{$src, $dst|$dst, $src}", []>; +} + +let Predicates = [HasAVX] in { + def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src), + (VMOVUPSmr addr:$dst, VR128:$src)>; + def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src), + (VMOVUPDmr addr:$dst, VR128:$src)>; +} + +let Predicates = [HasSSE1] in + def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src), + (MOVUPSmr addr:$dst, VR128:$src)>; +let Predicates = [HasSSE2] in + def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src), + (MOVUPDmr addr:$dst, VR128:$src)>; + +// Use movaps / movups for SSE integer load / store (one byte shorter). +// The instructions selected below are then converted to MOVDQA/MOVDQU +// during the SSE domain pass. +let Predicates = [HasSSE1] in { + def : Pat<(alignedloadv4i32 addr:$src), + (MOVAPSrm addr:$src)>; + def : Pat<(loadv4i32 addr:$src), + (MOVUPSrm addr:$src)>; + def : Pat<(alignedloadv2i64 addr:$src), + (MOVAPSrm addr:$src)>; + def : Pat<(loadv2i64 addr:$src), + (MOVUPSrm addr:$src)>; + + def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst), + (MOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst), + (MOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst), + (MOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst), + (MOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v2i64 VR128:$src), addr:$dst), + (MOVUPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v4i32 VR128:$src), addr:$dst), + (MOVUPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v8i16 VR128:$src), addr:$dst), + (MOVUPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v16i8 VR128:$src), addr:$dst), + (MOVUPSmr addr:$dst, VR128:$src)>; +} + +// Use vmovaps/vmovups for AVX integer load/store. 
+let Predicates = [HasAVX] in { + // 128-bit load/store + def : Pat<(alignedloadv4i32 addr:$src), + (VMOVAPSrm addr:$src)>; + def : Pat<(loadv4i32 addr:$src), + (VMOVUPSrm addr:$src)>; + def : Pat<(alignedloadv2i64 addr:$src), + (VMOVAPSrm addr:$src)>; + def : Pat<(loadv2i64 addr:$src), + (VMOVUPSrm addr:$src)>; + + def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst), + (VMOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst), + (VMOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst), + (VMOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst), + (VMOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v2i64 VR128:$src), addr:$dst), + (VMOVUPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v4i32 VR128:$src), addr:$dst), + (VMOVUPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v8i16 VR128:$src), addr:$dst), + (VMOVUPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v16i8 VR128:$src), addr:$dst), + (VMOVUPSmr addr:$dst, VR128:$src)>; + + // 256-bit load/store + def : Pat<(alignedloadv4i64 addr:$src), + (VMOVAPSYrm addr:$src)>; + def : Pat<(loadv4i64 addr:$src), + (VMOVUPSYrm addr:$src)>; + def : Pat<(alignedloadv8i32 addr:$src), + (VMOVAPSYrm addr:$src)>; + def : Pat<(loadv8i32 addr:$src), + (VMOVUPSYrm addr:$src)>; + def : Pat<(alignedstore256 (v4i64 VR256:$src), addr:$dst), + (VMOVAPSYmr addr:$dst, VR256:$src)>; + def : Pat<(alignedstore256 (v8i32 VR256:$src), addr:$dst), + (VMOVAPSYmr addr:$dst, VR256:$src)>; + def : Pat<(alignedstore256 (v16i16 VR256:$src), addr:$dst), + (VMOVAPSYmr addr:$dst, VR256:$src)>; + def : Pat<(alignedstore256 (v32i8 VR256:$src), addr:$dst), + (VMOVAPSYmr addr:$dst, VR256:$src)>; + def : Pat<(store (v4i64 VR256:$src), addr:$dst), + (VMOVUPSYmr addr:$dst, VR256:$src)>; + def : Pat<(store (v8i32 VR256:$src), addr:$dst), + (VMOVUPSYmr addr:$dst, VR256:$src)>; + def : Pat<(store (v16i16 VR256:$src), addr:$dst), + (VMOVUPSYmr addr:$dst, VR256:$src)>; + def : Pat<(store (v32i8 VR256:$src), addr:$dst), + (VMOVUPSYmr addr:$dst, VR256:$src)>; +} + +// Alias instruction to do FR32 or FR64 reg-to-reg copy using movaps. Upper +// bits are disregarded. FIXME: Set encoding to pseudo! +let neverHasSideEffects = 1 in { +def FsMOVAPSrr : PSI<0x28, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src), + "movaps\t{$src, $dst|$dst, $src}", []>; +def FsMOVAPDrr : PDI<0x28, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src), + "movapd\t{$src, $dst|$dst, $src}", []>; +def FsVMOVAPSrr : VPSI<0x28, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src), + "movaps\t{$src, $dst|$dst, $src}", []>, VEX; +def FsVMOVAPDrr : VPDI<0x28, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src), + "movapd\t{$src, $dst|$dst, $src}", []>, VEX; +} + +// Alias instruction to load FR32 or FR64 from f128mem using movaps. Upper +// bits are disregarded. FIXME: Set encoding to pseudo! 
+let canFoldAsLoad = 1, isReMaterializable = 1 in { +def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src), + "movaps\t{$src, $dst|$dst, $src}", + [(set FR32:$dst, (alignedloadfsf32 addr:$src))]>; +def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src), + "movapd\t{$src, $dst|$dst, $src}", + [(set FR64:$dst, (alignedloadfsf64 addr:$src))]>; +let isCodeGenOnly = 1 in { + def FsVMOVAPSrm : VPSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src), + "movaps\t{$src, $dst|$dst, $src}", + [(set FR32:$dst, (alignedloadfsf32 addr:$src))]>, VEX; + def FsVMOVAPDrm : VPDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src), + "movapd\t{$src, $dst|$dst, $src}", + [(set FR64:$dst, (alignedloadfsf64 addr:$src))]>, VEX; +} +} + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Move Low packed FP Instructions +//===----------------------------------------------------------------------===// + +multiclass sse12_mov_hilo_packed<bits<8>opc, RegisterClass RC, + PatFrag mov_frag, string base_opc, + string asm_opr> { + def PSrm : PI<opc, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2), + !strconcat(base_opc, "s", asm_opr), + [(set RC:$dst, + (mov_frag RC:$src1, + (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))))], + SSEPackedSingle>, TB; + + def PDrm : PI<opc, MRMSrcMem, + (outs RC:$dst), (ins RC:$src1, f64mem:$src2), + !strconcat(base_opc, "d", asm_opr), + [(set RC:$dst, (v2f64 (mov_frag RC:$src1, + (scalar_to_vector (loadf64 addr:$src2)))))], + SSEPackedDouble>, TB, OpSize; +} + +let AddedComplexity = 20 in { + defm VMOVL : sse12_mov_hilo_packed<0x12, VR128, movlp, "movlp", + "\t{$src2, $src1, $dst|$dst, $src1, $src2}">, VEX_4V; +} +let Constraints = "$src1 = $dst", AddedComplexity = 20 in { + defm MOVL : sse12_mov_hilo_packed<0x12, VR128, movlp, "movlp", + "\t{$src2, $dst|$dst, $src2}">; +} + +def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movlps\t{$src, $dst|$dst, $src}", + [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)), + (iPTR 0))), addr:$dst)]>, VEX; +def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movlpd\t{$src, $dst|$dst, $src}", + [(store (f64 (vector_extract (v2f64 VR128:$src), + (iPTR 0))), addr:$dst)]>, VEX; +def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movlps\t{$src, $dst|$dst, $src}", + [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)), + (iPTR 0))), addr:$dst)]>; +def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movlpd\t{$src, $dst|$dst, $src}", + [(store (f64 (vector_extract (v2f64 VR128:$src), + (iPTR 0))), addr:$dst)]>; + +let Predicates = [HasAVX] in { + let AddedComplexity = 20 in { + // vector_shuffle v1, (load v2) <4, 5, 2, 3> using MOVLPS + def : Pat<(v4f32 (movlp VR128:$src1, (load addr:$src2))), + (VMOVLPSrm VR128:$src1, addr:$src2)>; + def : Pat<(v4i32 (movlp VR128:$src1, (load addr:$src2))), + (VMOVLPSrm VR128:$src1, addr:$src2)>; + // vector_shuffle v1, (load v2) <2, 1> using MOVLPS + def : Pat<(v2f64 (movlp VR128:$src1, (load addr:$src2))), + (VMOVLPDrm VR128:$src1, addr:$src2)>; + def : Pat<(v2i64 (movlp VR128:$src1, (load addr:$src2))), + (VMOVLPDrm VR128:$src1, addr:$src2)>; + } + + // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS + def : Pat<(store (v4f32 (movlp (load addr:$src1), VR128:$src2)), addr:$src1), + (VMOVLPSmr addr:$src1, VR128:$src2)>; + def : Pat<(store (v4i32 (movlp 
(bc_v4i32 (loadv2i64 addr:$src1)), + VR128:$src2)), addr:$src1), + (VMOVLPSmr addr:$src1, VR128:$src2)>; + + // (store (vector_shuffle (load addr), v2, <2, 1>), addr) using MOVLPS + def : Pat<(store (v2f64 (movlp (load addr:$src1), VR128:$src2)), addr:$src1), + (VMOVLPDmr addr:$src1, VR128:$src2)>; + def : Pat<(store (v2i64 (movlp (load addr:$src1), VR128:$src2)), addr:$src1), + (VMOVLPDmr addr:$src1, VR128:$src2)>; + + // Shuffle with VMOVLPS + def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))), + (VMOVLPSrm VR128:$src1, addr:$src2)>; + def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))), + (VMOVLPSrm VR128:$src1, addr:$src2)>; + def : Pat<(X86Movlps VR128:$src1, + (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))), + (VMOVLPSrm VR128:$src1, addr:$src2)>; + + // Shuffle with VMOVLPD + def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))), + (VMOVLPDrm VR128:$src1, addr:$src2)>; + def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))), + (VMOVLPDrm VR128:$src1, addr:$src2)>; + def : Pat<(v2f64 (X86Movlpd VR128:$src1, + (scalar_to_vector (loadf64 addr:$src2)))), + (VMOVLPDrm VR128:$src1, addr:$src2)>; + + // Store patterns + def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), + addr:$src1), + (VMOVLPSmr addr:$src1, VR128:$src2)>; + def : Pat<(store (v4i32 (X86Movlps + (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)), addr:$src1), + (VMOVLPSmr addr:$src1, VR128:$src2)>; + def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)), + addr:$src1), + (VMOVLPDmr addr:$src1, VR128:$src2)>; + def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)), + addr:$src1), + (VMOVLPDmr addr:$src1, VR128:$src2)>; +} + +let Predicates = [HasSSE1] in { + let AddedComplexity = 20 in { + // vector_shuffle v1, (load v2) <4, 5, 2, 3> using MOVLPS + def : Pat<(v4f32 (movlp VR128:$src1, (load addr:$src2))), + (MOVLPSrm VR128:$src1, addr:$src2)>; + def : Pat<(v4i32 (movlp VR128:$src1, (load addr:$src2))), + (MOVLPSrm VR128:$src1, addr:$src2)>; + } + + // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS + def : Pat<(store (v4f32 (movlp (load addr:$src1), VR128:$src2)), addr:$src1), + (MOVLPSmr addr:$src1, VR128:$src2)>; + def : Pat<(store (v4i32 (movlp (bc_v4i32 (loadv2i64 addr:$src1)), + VR128:$src2)), addr:$src1), + (MOVLPSmr addr:$src1, VR128:$src2)>; + + // Shuffle with MOVLPS + def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))), + (MOVLPSrm VR128:$src1, addr:$src2)>; + def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))), + (MOVLPSrm VR128:$src1, addr:$src2)>; + def : Pat<(X86Movlps VR128:$src1, + (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))), + (MOVLPSrm VR128:$src1, addr:$src2)>; + + // Store patterns + def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), + addr:$src1), + (MOVLPSmr addr:$src1, VR128:$src2)>; + def : Pat<(store (v4i32 (X86Movlps + (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)), + addr:$src1), + (MOVLPSmr addr:$src1, VR128:$src2)>; +} + +let Predicates = [HasSSE2] in { + let AddedComplexity = 20 in { + // vector_shuffle v1, (load v2) <2, 1> using MOVLPS + def : Pat<(v2f64 (movlp VR128:$src1, (load addr:$src2))), + (MOVLPDrm VR128:$src1, addr:$src2)>; + def : Pat<(v2i64 (movlp VR128:$src1, (load addr:$src2))), + (MOVLPDrm VR128:$src1, addr:$src2)>; + } + + // (store (vector_shuffle (load addr), v2, <2, 1>), addr) using MOVLPS + def : Pat<(store (v2f64 (movlp (load addr:$src1), VR128:$src2)), addr:$src1), + (MOVLPDmr addr:$src1, VR128:$src2)>; + 
def : Pat<(store (v2i64 (movlp (load addr:$src1), VR128:$src2)), addr:$src1), + (MOVLPDmr addr:$src1, VR128:$src2)>; + + // Shuffle with MOVLPD + def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))), + (MOVLPDrm VR128:$src1, addr:$src2)>; + def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))), + (MOVLPDrm VR128:$src1, addr:$src2)>; + def : Pat<(v2f64 (X86Movlpd VR128:$src1, + (scalar_to_vector (loadf64 addr:$src2)))), + (MOVLPDrm VR128:$src1, addr:$src2)>; + + // Store patterns + def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)), + addr:$src1), + (MOVLPDmr addr:$src1, VR128:$src2)>; + def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)), + addr:$src1), + (MOVLPDmr addr:$src1, VR128:$src2)>; +} + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Move Hi packed FP Instructions +//===----------------------------------------------------------------------===// + +let AddedComplexity = 20 in { + defm VMOVH : sse12_mov_hilo_packed<0x16, VR128, movlhps, "movhp", + "\t{$src2, $src1, $dst|$dst, $src1, $src2}">, VEX_4V; +} +let Constraints = "$src1 = $dst", AddedComplexity = 20 in { + defm MOVH : sse12_mov_hilo_packed<0x16, VR128, movlhps, "movhp", + "\t{$src2, $dst|$dst, $src2}">; +} + +// v2f64 extract element 1 is always custom lowered to unpack high to low +// and extract element 0 so the non-store version isn't too horrible. +def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movhps\t{$src, $dst|$dst, $src}", + [(store (f64 (vector_extract + (unpckh (bc_v2f64 (v4f32 VR128:$src)), + (undef)), (iPTR 0))), addr:$dst)]>, + VEX; +def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movhpd\t{$src, $dst|$dst, $src}", + [(store (f64 (vector_extract + (v2f64 (unpckh VR128:$src, (undef))), + (iPTR 0))), addr:$dst)]>, + VEX; +def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movhps\t{$src, $dst|$dst, $src}", + [(store (f64 (vector_extract + (unpckh (bc_v2f64 (v4f32 VR128:$src)), + (undef)), (iPTR 0))), addr:$dst)]>; +def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movhpd\t{$src, $dst|$dst, $src}", + [(store (f64 (vector_extract + (v2f64 (unpckh VR128:$src, (undef))), + (iPTR 0))), addr:$dst)]>; + +let Predicates = [HasAVX] in { + // VMOVHPS patterns + def : Pat<(movlhps VR128:$src1, (bc_v4i32 (v2i64 (X86vzload addr:$src2)))), + (VMOVHPSrm (v4i32 VR128:$src1), addr:$src2)>; + def : Pat<(X86Movlhps VR128:$src1, + (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))), + (VMOVHPSrm VR128:$src1, addr:$src2)>; + def : Pat<(X86Movlhps VR128:$src1, + (bc_v4i32 (v2i64 (X86vzload addr:$src2)))), + (VMOVHPSrm VR128:$src1, addr:$src2)>; + + // FIXME: Instead of X86Unpcklpd, there should be a X86Movlhpd here, the problem + // is during lowering, where it's not possible to recognize the load fold cause + // it has two uses through a bitcast. One use disappears at isel time and the + // fold opportunity reappears. + def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, + (scalar_to_vector (loadf64 addr:$src2)))), + (VMOVHPDrm VR128:$src1, addr:$src2)>; + + // FIXME: This should be matched by a X86Movhpd instead. 
Same as above + def : Pat<(v2f64 (X86Movlhpd VR128:$src1, + (scalar_to_vector (loadf64 addr:$src2)))), + (VMOVHPDrm VR128:$src1, addr:$src2)>; + + // Store patterns + def : Pat<(store (f64 (vector_extract + (v2f64 (X86Unpckhps VR128:$src, (undef))), (iPTR 0))), addr:$dst), + (VMOVHPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (f64 (vector_extract + (v2f64 (X86Unpckhpd VR128:$src, (undef))), (iPTR 0))), addr:$dst), + (VMOVHPDmr addr:$dst, VR128:$src)>; +} + +let Predicates = [HasSSE1] in { + // MOVHPS patterns + def : Pat<(movlhps VR128:$src1, (bc_v4i32 (v2i64 (X86vzload addr:$src2)))), + (MOVHPSrm (v4i32 VR128:$src1), addr:$src2)>; + def : Pat<(X86Movlhps VR128:$src1, + (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))), + (MOVHPSrm VR128:$src1, addr:$src2)>; + def : Pat<(X86Movlhps VR128:$src1, + (bc_v4f32 (v2i64 (X86vzload addr:$src2)))), + (MOVHPSrm VR128:$src1, addr:$src2)>; + + // Store patterns + def : Pat<(store (f64 (vector_extract + (v2f64 (X86Unpckhps VR128:$src, (undef))), (iPTR 0))), addr:$dst), + (MOVHPSmr addr:$dst, VR128:$src)>; +} + +let Predicates = [HasSSE2] in { + // FIXME: Instead of X86Unpcklpd, there should be a X86Movlhpd here, the problem + // is during lowering, where it's not possible to recognize the load fold cause + // it has two uses through a bitcast. One use disappears at isel time and the + // fold opportunity reappears. + def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, + (scalar_to_vector (loadf64 addr:$src2)))), + (MOVHPDrm VR128:$src1, addr:$src2)>; + + // FIXME: This should be matched by a X86Movhpd instead. Same as above + def : Pat<(v2f64 (X86Movlhpd VR128:$src1, + (scalar_to_vector (loadf64 addr:$src2)))), + (MOVHPDrm VR128:$src1, addr:$src2)>; + + // Store patterns + def : Pat<(store (f64 (vector_extract + (v2f64 (X86Unpckhpd VR128:$src, (undef))), (iPTR 0))),addr:$dst), + (MOVHPDmr addr:$dst, VR128:$src)>; +} + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions +//===----------------------------------------------------------------------===// + +let AddedComplexity = 20 in { + def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (v4f32 (movlhps VR128:$src1, VR128:$src2)))]>, + VEX_4V; + def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (v4f32 (movhlps VR128:$src1, VR128:$src2)))]>, + VEX_4V; +} +let Constraints = "$src1 = $dst", AddedComplexity = 20 in { + def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + "movlhps\t{$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v4f32 (movlhps VR128:$src1, VR128:$src2)))]>; + def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + "movhlps\t{$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v4f32 (movhlps VR128:$src1, VR128:$src2)))]>; +} + +let Predicates = [HasAVX] in { + // MOVLHPS patterns + let AddedComplexity = 20 in { + def : Pat<(v4f32 (movddup VR128:$src, (undef))), + (VMOVLHPSrr (v4f32 VR128:$src), (v4f32 VR128:$src))>; + def : Pat<(v2i64 (movddup VR128:$src, (undef))), + (VMOVLHPSrr (v2i64 VR128:$src), (v2i64 VR128:$src))>; + + // vector_shuffle v1, v2 <0, 1, 4, 5> using MOVLHPS + def : Pat<(v4i32 (movlhps VR128:$src1, VR128:$src2)), + (VMOVLHPSrr VR128:$src1, VR128:$src2)>; + } + def : 
Pat<(v4f32 (X86Movlhps VR128:$src1, VR128:$src2)), + (VMOVLHPSrr VR128:$src1, VR128:$src2)>; + def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)), + (VMOVLHPSrr VR128:$src1, VR128:$src2)>; + def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)), + (VMOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>; + + // MOVHLPS patterns + let AddedComplexity = 20 in { + // vector_shuffle v1, v2 <6, 7, 2, 3> using MOVHLPS + def : Pat<(v4i32 (movhlps VR128:$src1, VR128:$src2)), + (VMOVHLPSrr VR128:$src1, VR128:$src2)>; + + // vector_shuffle v1, undef <2, ?, ?, ?> using MOVHLPS + def : Pat<(v4f32 (movhlps_undef VR128:$src1, (undef))), + (VMOVHLPSrr VR128:$src1, VR128:$src1)>; + def : Pat<(v4i32 (movhlps_undef VR128:$src1, (undef))), + (VMOVHLPSrr VR128:$src1, VR128:$src1)>; + } + + def : Pat<(v4f32 (X86Movhlps VR128:$src1, VR128:$src2)), + (VMOVHLPSrr VR128:$src1, VR128:$src2)>; + def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)), + (VMOVHLPSrr VR128:$src1, VR128:$src2)>; +} + +let Predicates = [HasSSE1] in { + // MOVLHPS patterns + let AddedComplexity = 20 in { + def : Pat<(v4f32 (movddup VR128:$src, (undef))), + (MOVLHPSrr (v4f32 VR128:$src), (v4f32 VR128:$src))>; + def : Pat<(v2i64 (movddup VR128:$src, (undef))), + (MOVLHPSrr (v2i64 VR128:$src), (v2i64 VR128:$src))>; + + // vector_shuffle v1, v2 <0, 1, 4, 5> using MOVLHPS + def : Pat<(v4i32 (movlhps VR128:$src1, VR128:$src2)), + (MOVLHPSrr VR128:$src1, VR128:$src2)>; + } + def : Pat<(v4f32 (X86Movlhps VR128:$src1, VR128:$src2)), + (MOVLHPSrr VR128:$src1, VR128:$src2)>; + def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)), + (MOVLHPSrr VR128:$src1, VR128:$src2)>; + def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)), + (MOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>; + + // MOVHLPS patterns + let AddedComplexity = 20 in { + // vector_shuffle v1, v2 <6, 7, 2, 3> using MOVHLPS + def : Pat<(v4i32 (movhlps VR128:$src1, VR128:$src2)), + (MOVHLPSrr VR128:$src1, VR128:$src2)>; + + // vector_shuffle v1, undef <2, ?, ?, ?> using MOVHLPS + def : Pat<(v4f32 (movhlps_undef VR128:$src1, (undef))), + (MOVHLPSrr VR128:$src1, VR128:$src1)>; + def : Pat<(v4i32 (movhlps_undef VR128:$src1, (undef))), + (MOVHLPSrr VR128:$src1, VR128:$src1)>; + } + + def : Pat<(v4f32 (X86Movhlps VR128:$src1, VR128:$src2)), + (MOVHLPSrr VR128:$src1, VR128:$src2)>; + def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)), + (MOVHLPSrr VR128:$src1, VR128:$src2)>; +} + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Conversion Instructions +//===----------------------------------------------------------------------===// + +multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, + SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag, + string asm> { + def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm, + [(set DstRC:$dst, (OpNode SrcRC:$src))]>; + def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm, + [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>; +} + +multiclass sse12_cvt_s_np<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, + X86MemOperand x86memop, string asm> { + def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm, []>; + let mayLoad = 1 in + def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm, []>; +} + +multiclass sse12_cvt_p<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, + SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag, + string asm, Domain d> { + def rr : PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins 
SrcRC:$src), asm, + [(set DstRC:$dst, (OpNode SrcRC:$src))], d>; + def rm : PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm, + [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))], d>; +} + +multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, + X86MemOperand x86memop, string asm> { + def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src), + !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>; + let mayLoad = 1 in + def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), + (ins DstRC:$src1, x86memop:$src), + !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>; +} + +defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32, + "cvttss2si\t{$src, $dst|$dst, $src}">, XS, VEX, + VEX_LIG; +defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32, + "cvttss2si\t{$src, $dst|$dst, $src}">, XS, VEX, + VEX_W, VEX_LIG; +defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64, + "cvttsd2si\t{$src, $dst|$dst, $src}">, XD, VEX, + VEX_LIG; +defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64, + "cvttsd2si\t{$src, $dst|$dst, $src}">, XD, + VEX, VEX_W, VEX_LIG; + +// The assembler can recognize rr 64-bit instructions by seeing a rxx +// register, but the same isn't true when only using memory operands, +// provide other assembly "l" and "q" forms to address this explicitly +// where appropriate to do so. +defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss">, XS, + VEX_4V, VEX_LIG; +defm VCVTSI2SS64 : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss{q}">, XS, + VEX_4V, VEX_W, VEX_LIG; +defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd">, XD, + VEX_4V, VEX_LIG; +defm VCVTSI2SDL : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}">, XD, + VEX_4V, VEX_LIG; +defm VCVTSI2SD64 : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}">, XD, + VEX_4V, VEX_W, VEX_LIG; + +let Predicates = [HasAVX] in { + def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))), + (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>; + def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))), + (VCVTSI2SS64rm (f32 (IMPLICIT_DEF)), addr:$src)>; + def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))), + (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>; + def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))), + (VCVTSI2SD64rm (f64 (IMPLICIT_DEF)), addr:$src)>; + + def : Pat<(f32 (sint_to_fp GR32:$src)), + (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>; + def : Pat<(f32 (sint_to_fp GR64:$src)), + (VCVTSI2SS64rr (f32 (IMPLICIT_DEF)), GR64:$src)>; + def : Pat<(f64 (sint_to_fp GR32:$src)), + (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>; + def : Pat<(f64 (sint_to_fp GR64:$src)), + (VCVTSI2SD64rr (f64 (IMPLICIT_DEF)), GR64:$src)>; +} + +defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32, + "cvttss2si\t{$src, $dst|$dst, $src}">, XS; +defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32, + "cvttss2si{q}\t{$src, $dst|$dst, $src}">, XS, REX_W; +defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64, + "cvttsd2si\t{$src, $dst|$dst, $src}">, XD; +defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64, + "cvttsd2si{q}\t{$src, $dst|$dst, $src}">, XD, REX_W; +defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32, + "cvtsi2ss\t{$src, $dst|$dst, $src}">, XS; +defm CVTSI2SS64 : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64, + "cvtsi2ss{q}\t{$src, $dst|$dst, $src}">, XS, 
REX_W; +defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32, + "cvtsi2sd\t{$src, $dst|$dst, $src}">, XD; +defm CVTSI2SD64 : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64, + "cvtsi2sd{q}\t{$src, $dst|$dst, $src}">, XD, REX_W; + +// Conversion Instructions Intrinsics - Match intrinsics which expect MM +// and/or XMM operand(s). + +multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, + Intrinsic Int, X86MemOperand x86memop, PatFrag ld_frag, + string asm> { + def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(set DstRC:$dst, (Int SrcRC:$src))]>; + def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(set DstRC:$dst, (Int (ld_frag addr:$src)))]>; +} + +multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC, + RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop, + PatFrag ld_frag, string asm, bit Is2Addr = 1> { + def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2), + !if(Is2Addr, + !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))]>; + def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), + (ins DstRC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))]>; +} + +defm Int_VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si, + f128mem, load, "cvtsd2si">, XD, VEX; +defm Int_VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, + int_x86_sse2_cvtsd2si64, f128mem, load, "cvtsd2si">, + XD, VEX, VEX_W; + +// FIXME: The asm matcher has a hack to ignore instructions with _Int and Int_. +// Get rid of this hack or rename the intrinsics; there are several +// instructions that only match with the intrinsic form, so why create duplicates +// just to let them be recognized by the assembler?
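As a rough source-level illustration of why both the Int_ (intrinsic) forms and the plain scalar forms exist, here is a minimal C++ sketch; the helper names are hypothetical and the mapping onto the selected patterns is an assumption for illustration, not something stated in this patch:

// Illustrative only: assumes an SSE2 target and <emmintrin.h>.
#include <emmintrin.h>

// The operand is a full XMM vector, so only the intrinsic (Int_*) cvtsd2si
// patterns above can match this.
int cvt_from_vector(__m128d v) { return _mm_cvtsd_si32(v); }

// A plain scalar double: C++ casts truncate toward zero, so this is a generic
// fp_to_sint and is covered by the (V)CVTTSD2SI definitions earlier in the
// file.
int cvt_from_scalar(double d) { return static_cast<int>(d); }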
+defm VCVTSD2SI : sse12_cvt_s_np<0x2D, FR64, GR32, f64mem, + "cvtsd2si\t{$src, $dst|$dst, $src}">, XD, VEX, VEX_LIG; +defm VCVTSD2SI64 : sse12_cvt_s_np<0x2D, FR64, GR64, f64mem, + "cvtsd2si\t{$src, $dst|$dst, $src}">, XD, VEX, VEX_W, + VEX_LIG; + +defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si, + f128mem, load, "cvtsd2si{l}">, XD; +defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64, + f128mem, load, "cvtsd2si{q}">, XD, REX_W; + + +defm Int_VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, + int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss", 0>, XS, VEX_4V; +defm Int_VCVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, + int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss", 0>, XS, VEX_4V, + VEX_W; +defm Int_VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128, + int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd", 0>, XD, VEX_4V; +defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, + int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd", 0>, XD, + VEX_4V, VEX_W; + +let Constraints = "$src1 = $dst" in { + defm Int_CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, + int_x86_sse_cvtsi2ss, i32mem, loadi32, + "cvtsi2ss">, XS; + defm Int_CVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, + int_x86_sse_cvtsi642ss, i64mem, loadi64, + "cvtsi2ss{q}">, XS, REX_W; + defm Int_CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128, + int_x86_sse2_cvtsi2sd, i32mem, loadi32, + "cvtsi2sd">, XD; + defm Int_CVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, + int_x86_sse2_cvtsi642sd, i64mem, loadi64, + "cvtsi2sd">, XD, REX_W; +} + +/// SSE 1 Only + +// Aliases for intrinsics +defm Int_VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si, + f32mem, load, "cvttss2si">, XS, VEX; +defm Int_VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, + int_x86_sse_cvttss2si64, f32mem, load, + "cvttss2si">, XS, VEX, VEX_W; +defm Int_VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si, + f128mem, load, "cvttsd2si">, XD, VEX; +defm Int_VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, + int_x86_sse2_cvttsd2si64, f128mem, load, + "cvttsd2si">, XD, VEX, VEX_W; +defm Int_CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si, + f32mem, load, "cvttss2si">, XS; +defm Int_CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, + int_x86_sse_cvttss2si64, f32mem, load, + "cvttss2si{q}">, XS, REX_W; +defm Int_CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si, + f128mem, load, "cvttsd2si">, XD; +defm Int_CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, + int_x86_sse2_cvttsd2si64, f128mem, load, + "cvttsd2si{q}">, XD, REX_W; + +let Pattern = []<dag> in { +defm VCVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, undef, f32mem, load, + "cvtss2si{l}\t{$src, $dst|$dst, $src}">, XS, + VEX, VEX_LIG; +defm VCVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, undef, f32mem, load, + "cvtss2si\t{$src, $dst|$dst, $src}">, XS, VEX, + VEX_W, VEX_LIG; +defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, undef, i128mem, load, + "cvtdq2ps\t{$src, $dst|$dst, $src}", + SSEPackedSingle>, TB, VEX; +defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, VR256, undef, i256mem, load, + "cvtdq2ps\t{$src, $dst|$dst, $src}", + SSEPackedSingle>, TB, VEX; +} + +let Pattern = []<dag> in { +defm CVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, undef, f32mem, load /*dummy*/, + "cvtss2si{l}\t{$src, $dst|$dst, $src}">, XS; +defm CVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, undef, f32mem, load /*dummy*/, + "cvtss2si{q}\t{$src, $dst|$dst, $src}">, XS, REX_W; +defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, 
VR128, undef, i128mem, load /*dummy*/, + "cvtdq2ps\t{$src, $dst|$dst, $src}", + SSEPackedSingle>, TB; /* PD SSE3 form is avaiable */ +} + +let Predicates = [HasSSE1] in { + def : Pat<(int_x86_sse_cvtss2si VR128:$src), + (CVTSS2SIrr (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>; + def : Pat<(int_x86_sse_cvtss2si (load addr:$src)), + (CVTSS2SIrm addr:$src)>; + def : Pat<(int_x86_sse_cvtss2si64 VR128:$src), + (CVTSS2SI64rr (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>; + def : Pat<(int_x86_sse_cvtss2si64 (load addr:$src)), + (CVTSS2SI64rm addr:$src)>; +} + +let Predicates = [HasAVX] in { + def : Pat<(int_x86_sse_cvtss2si VR128:$src), + (VCVTSS2SIrr (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>; + def : Pat<(int_x86_sse_cvtss2si (load addr:$src)), + (VCVTSS2SIrm addr:$src)>; + def : Pat<(int_x86_sse_cvtss2si64 VR128:$src), + (VCVTSS2SI64rr (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>; + def : Pat<(int_x86_sse_cvtss2si64 (load addr:$src)), + (VCVTSS2SI64rm addr:$src)>; +} + +/// SSE 2 Only + +// Convert scalar double to scalar single +def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst), + (ins FR64:$src1, FR64:$src2), + "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + VEX_4V, VEX_LIG; +let mayLoad = 1 in +def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), + (ins FR64:$src1, f64mem:$src2), + "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, XD, Requires<[HasAVX, OptForSize]>, VEX_4V, VEX_LIG; + +def : Pat<(f32 (fround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>, + Requires<[HasAVX]>; + +def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src), + "cvtsd2ss\t{$src, $dst|$dst, $src}", + [(set FR32:$dst, (fround FR64:$src))]>; +def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src), + "cvtsd2ss\t{$src, $dst|$dst, $src}", + [(set FR32:$dst, (fround (loadf64 addr:$src)))]>, XD, + Requires<[HasSSE2, OptForSize]>; + +defm Int_VCVTSD2SS: sse12_cvt_sint_3addr<0x5A, VR128, VR128, + int_x86_sse2_cvtsd2ss, f64mem, load, "cvtsd2ss", 0>, + XS, VEX_4V; +let Constraints = "$src1 = $dst" in +defm Int_CVTSD2SS: sse12_cvt_sint_3addr<0x5A, VR128, VR128, + int_x86_sse2_cvtsd2ss, f64mem, load, "cvtsd2ss">, XS; + +// Convert scalar single to scalar double +// SSE2 instructions with XS prefix +def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), + (ins FR32:$src1, FR32:$src2), + "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, XS, Requires<[HasAVX]>, VEX_4V, VEX_LIG; +let mayLoad = 1 in +def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), + (ins FR32:$src1, f32mem:$src2), + "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>; + +let Predicates = [HasAVX] in { + def : Pat<(f64 (fextend FR32:$src)), + (VCVTSS2SDrr FR32:$src, FR32:$src)>; + def : Pat<(fextend (loadf32 addr:$src)), + (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>; + def : Pat<(extloadf32 addr:$src), + (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>; +} + +def : Pat<(extloadf32 addr:$src), + (VCVTSS2SDrr (f32 (IMPLICIT_DEF)), (MOVSSrm addr:$src))>, + Requires<[HasAVX, OptForSpeed]>; + +def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src), + "cvtss2sd\t{$src, $dst|$dst, $src}", + [(set FR64:$dst, (fextend FR32:$src))]>, XS, + Requires<[HasSSE2]>; +def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src), + "cvtss2sd\t{$src, $dst|$dst, $src}", + [(set FR64:$dst, (extloadf32 addr:$src))]>, XS, + Requires<[HasSSE2, OptForSize]>; + +// extload f32 -> f64. 
This matches load+fextend because we have a hack in +// the isel (PreprocessForFPConvert) that can introduce loads after dag +// combine. +// Since these loads aren't folded into the fextend, we have to match it +// explicitly here. +def : Pat<(fextend (loadf32 addr:$src)), + (CVTSS2SDrm addr:$src)>, Requires<[HasSSE2]>; +def : Pat<(extloadf32 addr:$src), + (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[HasSSE2, OptForSpeed]>; + +def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1, + VR128:$src2))]>, XS, VEX_4V, + Requires<[HasAVX]>; +def Int_VCVTSS2SDrm: I<0x5A, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, f32mem:$src2), + "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1, + (load addr:$src2)))]>, XS, VEX_4V, + Requires<[HasAVX]>; +let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix +def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + "cvtss2sd\t{$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1, + VR128:$src2))]>, XS, + Requires<[HasSSE2]>; +def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, f32mem:$src2), + "cvtss2sd\t{$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1, + (load addr:$src2)))]>, XS, + Requires<[HasSSE2]>; +} + +// Convert doubleword to packed single/double fp +// SSE2 instructions without OpSize prefix +def Int_VCVTDQ2PSrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "vcvtdq2ps\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtdq2ps VR128:$src))]>, + TB, VEX, Requires<[HasAVX]>; +def Int_VCVTDQ2PSrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "vcvtdq2ps\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtdq2ps + (bitconvert (memopv2i64 addr:$src))))]>, + TB, VEX, Requires<[HasAVX]>; +def Int_CVTDQ2PSrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvtdq2ps\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtdq2ps VR128:$src))]>, + TB, Requires<[HasSSE2]>; +def Int_CVTDQ2PSrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "cvtdq2ps\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtdq2ps + (bitconvert (memopv2i64 addr:$src))))]>, + TB, Requires<[HasSSE2]>; + +// FIXME: why the non-intrinsic version is described as SSE3? 
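For orientation, a minimal C++ sketch of the packed int-to-FP conversions that the Int_*CVTDQ2PS patterns above and the Int_*CVTDQ2PD patterns below are written against; the helper names are hypothetical and the intrinsic-to-pattern mapping is an assumption for illustration, not part of this patch:

// Illustrative only: assumes an SSE2 target and <emmintrin.h>.
#include <emmintrin.h>

// Four i32 lanes -> four floats, i.e. (v)cvtdq2ps.
__m128 int_to_ps(__m128i v) { return _mm_cvtepi32_ps(v); }

// Low two i32 lanes -> two doubles, i.e. (v)cvtdq2pd.
__m128d int_to_pd(__m128i v) { return _mm_cvtepi32_pd(v); }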
+// SSE2 instructions with XS prefix +def Int_VCVTDQ2PDrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "vcvtdq2pd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))]>, + XS, VEX, Requires<[HasAVX]>; +def Int_VCVTDQ2PDrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), + "vcvtdq2pd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtdq2pd + (bitconvert (memopv2i64 addr:$src))))]>, + XS, VEX, Requires<[HasAVX]>; +def Int_CVTDQ2PDrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvtdq2pd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))]>, + XS, Requires<[HasSSE2]>; +def Int_CVTDQ2PDrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), + "cvtdq2pd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtdq2pd + (bitconvert (memopv2i64 addr:$src))))]>, + XS, Requires<[HasSSE2]>; + + +// Convert packed single/double fp to doubleword +def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvtps2dq\t{$src, $dst|$dst, $src}", []>, VEX; +def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvtps2dq\t{$src, $dst|$dst, $src}", []>, VEX; +def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + "cvtps2dq\t{$src, $dst|$dst, $src}", []>, VEX; +def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), + "cvtps2dq\t{$src, $dst|$dst, $src}", []>, VEX; +def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvtps2dq\t{$src, $dst|$dst, $src}", []>; +def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvtps2dq\t{$src, $dst|$dst, $src}", []>; + +def Int_VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvtps2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))]>, + VEX; +def Int_VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), + (ins f128mem:$src), + "cvtps2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtps2dq + (memop addr:$src)))]>, VEX; +def Int_CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvtps2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))]>; +def Int_CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvtps2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtps2dq + (memop addr:$src)))]>; + +// SSE2 packed instructions with XD prefix +def Int_VCVTPD2DQrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "vcvtpd2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>, + XD, VEX, Requires<[HasAVX]>; +def Int_VCVTPD2DQrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "vcvtpd2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtpd2dq + (memop addr:$src)))]>, + XD, VEX, Requires<[HasAVX]>; +def Int_CVTPD2DQrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvtpd2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>, + XD, Requires<[HasSSE2]>; +def Int_CVTPD2DQrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvtpd2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtpd2dq + (memop addr:$src)))]>, + XD, Requires<[HasSSE2]>; + + +// Convert with truncation packed single/double fp to doubleword +// SSE2 packed instructions with XS prefix +def VCVTTPS2DQrr : VSSI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins 
VR128:$src), + "cvttps2dq\t{$src, $dst|$dst, $src}", []>, VEX; +let mayLoad = 1 in +def VCVTTPS2DQrm : VSSI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvttps2dq\t{$src, $dst|$dst, $src}", []>, VEX; +def VCVTTPS2DQYrr : VSSI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + "cvttps2dq\t{$src, $dst|$dst, $src}", []>, VEX; +let mayLoad = 1 in +def VCVTTPS2DQYrm : VSSI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), + "cvttps2dq\t{$src, $dst|$dst, $src}", []>, VEX; +def CVTTPS2DQrr : SSI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvttps2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (int_x86_sse2_cvttps2dq VR128:$src))]>; +def CVTTPS2DQrm : SSI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvttps2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (int_x86_sse2_cvttps2dq (memop addr:$src)))]>; + +def Int_VCVTTPS2DQrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "vcvttps2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (int_x86_sse2_cvttps2dq VR128:$src))]>, + XS, VEX, Requires<[HasAVX]>; +def Int_VCVTTPS2DQrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "vcvttps2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvttps2dq + (memop addr:$src)))]>, + XS, VEX, Requires<[HasAVX]>; + +let Predicates = [HasSSE2] in { + def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))), + (Int_CVTDQ2PSrr VR128:$src)>; + def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))), + (CVTTPS2DQrr VR128:$src)>; +} + +let Predicates = [HasAVX] in { + def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))), + (Int_VCVTDQ2PSrr VR128:$src)>; + def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))), + (VCVTTPS2DQrr VR128:$src)>; + def : Pat<(v8f32 (sint_to_fp (v8i32 VR256:$src))), + (VCVTDQ2PSYrr VR256:$src)>; + def : Pat<(v8i32 (fp_to_sint (v8f32 VR256:$src))), + (VCVTTPS2DQYrr VR256:$src)>; +} + +def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvttpd2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (int_x86_sse2_cvttpd2dq VR128:$src))]>, VEX; +let isCodeGenOnly = 1 in +def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvttpd2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvttpd2dq + (memop addr:$src)))]>, VEX; +def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvttpd2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))]>; +def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src), + "cvttpd2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvttpd2dq + (memop addr:$src)))]>; + +// The assembler can recognize rr 256-bit instructions by seeing a ymm +// register, but the same isn't true when using memory operands instead. +// Provide other assembly rr and rm forms to address this explicitly. 
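+// For example, "vcvttpd2dq %ymm1, %xmm0" is unambiguous because the ymm source
+// implies a 256-bit operation, but "vcvttpd2dq (%rax), %xmm0" could name either
+// a 128-bit or a 256-bit memory source. The suffixed mnemonics below make the
+// size explicit:
+//   vcvttpd2dqx (%rax), %xmm0   # 128-bit memory source
+//   vcvttpd2dqy (%rax), %xmm0   # 256-bit memory source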
+def VCVTTPD2DQXrYr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), + "cvttpd2dq\t{$src, $dst|$dst, $src}", []>, VEX; + +// XMM only +def VCVTTPD2DQXrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvttpd2dqx\t{$src, $dst|$dst, $src}", []>, VEX; +def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvttpd2dqx\t{$src, $dst|$dst, $src}", []>, VEX; + +// YMM only +def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), + "cvttpd2dqy\t{$src, $dst|$dst, $src}", []>, VEX; +def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), + "cvttpd2dqy\t{$src, $dst|$dst, $src}", []>, VEX, VEX_L; + +// Convert packed single to packed double +let Predicates = [HasAVX] in { + // SSE2 instructions without OpSize prefix +def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, TB, VEX; +def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), + "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, TB, VEX; +def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), + "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, TB, VEX; +def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src), + "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, TB, VEX; +} +def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvtps2pd\t{$src, $dst|$dst, $src}", []>, TB; +def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), + "cvtps2pd\t{$src, $dst|$dst, $src}", []>, TB; + +def Int_VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "vcvtps2pd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))]>, + TB, VEX, Requires<[HasAVX]>; +def Int_VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), + "vcvtps2pd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtps2pd + (load addr:$src)))]>, + TB, VEX, Requires<[HasAVX]>; +def Int_CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvtps2pd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))]>, + TB, Requires<[HasSSE2]>; +def Int_CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), + "cvtps2pd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtps2pd + (load addr:$src)))]>, + TB, Requires<[HasSSE2]>; + +// Convert packed double to packed single +// The assembler can recognize rr 256-bit instructions by seeing a ymm +// register, but the same isn't true when using memory operands instead. +// Provide other assembly rr and rm forms to address this explicitly. 
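+// The same ambiguity exists for cvtpd2ps, so explicit cvtpd2psx/cvtpd2psy
+// spellings are provided as well, e.g. "vcvtpd2psy (%rax), %xmm0" for a
+// 256-bit memory source narrowed to four singles.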
+def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvtpd2ps\t{$src, $dst|$dst, $src}", []>, VEX; +def VCVTPD2PSXrYr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), + "cvtpd2ps\t{$src, $dst|$dst, $src}", []>, VEX; + +// XMM only +def VCVTPD2PSXrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvtpd2psx\t{$src, $dst|$dst, $src}", []>, VEX; +def VCVTPD2PSXrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvtpd2psx\t{$src, $dst|$dst, $src}", []>, VEX; + +// YMM only +def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), + "cvtpd2psy\t{$src, $dst|$dst, $src}", []>, VEX; +def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), + "cvtpd2psy\t{$src, $dst|$dst, $src}", []>, VEX, VEX_L; +def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvtpd2ps\t{$src, $dst|$dst, $src}", []>; +def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvtpd2ps\t{$src, $dst|$dst, $src}", []>; + + +def Int_VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvtpd2ps\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))]>; +def Int_VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), + (ins f128mem:$src), + "cvtpd2ps\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtpd2ps + (memop addr:$src)))]>; +def Int_CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvtpd2ps\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))]>; +def Int_CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvtpd2ps\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse2_cvtpd2ps + (memop addr:$src)))]>; + +// AVX 256-bit register conversion intrinsics +// FIXME: Migrate SSE conversion intrinsics matching to use patterns as below +// whenever possible to avoid declaring two versions of each one. 
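+// The pattern style referred to above defines each instruction once with an
+// empty pattern and then maps the intrinsic onto it separately, e.g.
+//   def : Pat<(int_x86_avx_cvtdq2_ps_256 VR256:$src), (VCVTDQ2PSYrr VR256:$src)>;
+// which avoids carrying a second Int_* copy of the instruction just to hold
+// the intrinsic pattern.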
+def : Pat<(int_x86_avx_cvtdq2_ps_256 VR256:$src), + (VCVTDQ2PSYrr VR256:$src)>; +def : Pat<(int_x86_avx_cvtdq2_ps_256 (memopv8i32 addr:$src)), + (VCVTDQ2PSYrm addr:$src)>; + +def : Pat<(int_x86_avx_cvt_pd2_ps_256 VR256:$src), + (VCVTPD2PSYrr VR256:$src)>; +def : Pat<(int_x86_avx_cvt_pd2_ps_256 (memopv4f64 addr:$src)), + (VCVTPD2PSYrm addr:$src)>; + +def : Pat<(int_x86_avx_cvt_ps2dq_256 VR256:$src), + (VCVTPS2DQYrr VR256:$src)>; +def : Pat<(int_x86_avx_cvt_ps2dq_256 (memopv8f32 addr:$src)), + (VCVTPS2DQYrm addr:$src)>; + +def : Pat<(int_x86_avx_cvt_ps2_pd_256 VR128:$src), + (VCVTPS2PDYrr VR128:$src)>; +def : Pat<(int_x86_avx_cvt_ps2_pd_256 (memopv4f32 addr:$src)), + (VCVTPS2PDYrm addr:$src)>; + +def : Pat<(int_x86_avx_cvtt_pd2dq_256 VR256:$src), + (VCVTTPD2DQYrr VR256:$src)>; +def : Pat<(int_x86_avx_cvtt_pd2dq_256 (memopv4f64 addr:$src)), + (VCVTTPD2DQYrm addr:$src)>; + +def : Pat<(int_x86_avx_cvtt_ps2dq_256 VR256:$src), + (VCVTTPS2DQYrr VR256:$src)>; +def : Pat<(int_x86_avx_cvtt_ps2dq_256 (memopv8f32 addr:$src)), + (VCVTTPS2DQYrm addr:$src)>; + +// Match fround and fextend for 128/256-bit conversions +def : Pat<(v4f32 (fround (v4f64 VR256:$src))), + (VCVTPD2PSYrr VR256:$src)>; +def : Pat<(v4f32 (fround (loadv4f64 addr:$src))), + (VCVTPD2PSYrm addr:$src)>; + +def : Pat<(v4f64 (fextend (v4f32 VR128:$src))), + (VCVTPS2PDYrr VR128:$src)>; +def : Pat<(v4f64 (fextend (loadv4f32 addr:$src))), + (VCVTPS2PDYrm addr:$src)>; + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Compare Instructions +//===----------------------------------------------------------------------===// + +// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions +multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop, + SDNode OpNode, ValueType VT, PatFrag ld_frag, + string asm, string asm_alt> { + def rr : SIi8<0xC2, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, RC:$src2, SSECC:$cc), asm, + [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))]>; + def rm : SIi8<0xC2, MRMSrcMem, + (outs RC:$dst), (ins RC:$src1, x86memop:$src2, SSECC:$cc), asm, + [(set RC:$dst, (OpNode (VT RC:$src1), + (ld_frag addr:$src2), imm:$cc))]>; + + // Accept explicit immediate argument form instead of comparison code. 
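+  // Both spellings assemble to the same encoding; for the SSE form, e.g.:
+  //   cmpltss %xmm1, %xmm0      # condition-code mnemonic (lt = immediate 1)
+  //   cmpss $1, %xmm1, %xmm0    # explicit immediate form matched below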
+ let neverHasSideEffects = 1 in { + def rr_alt : SIi8<0xC2, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, i8imm:$cc), asm_alt, []>; + let mayLoad = 1 in + def rm_alt : SIi8<0xC2, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2, i8imm:$cc), asm_alt, []>; + } +} + +defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, X86cmpss, f32, loadf32, + "cmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", + "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">, + XS, VEX_4V, VEX_LIG; +defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, X86cmpsd, f64, loadf64, + "cmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">, + XD, VEX_4V, VEX_LIG; + +let Constraints = "$src1 = $dst" in { + defm CMPSS : sse12_cmp_scalar<FR32, f32mem, X86cmpss, f32, loadf32, + "cmp${cc}ss\t{$src2, $dst|$dst, $src2}", + "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}">, + XS; + defm CMPSD : sse12_cmp_scalar<FR64, f64mem, X86cmpsd, f64, loadf64, + "cmp${cc}sd\t{$src2, $dst|$dst, $src2}", + "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}">, + XD; +} + +multiclass sse12_cmp_scalar_int<RegisterClass RC, X86MemOperand x86memop, + Intrinsic Int, string asm> { + def rr : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src, SSECC:$cc), asm, + [(set VR128:$dst, (Int VR128:$src1, + VR128:$src, imm:$cc))]>; + def rm : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, f32mem:$src, SSECC:$cc), asm, + [(set VR128:$dst, (Int VR128:$src1, + (load addr:$src), imm:$cc))]>; +} + +// Aliases to match intrinsics which expect XMM operand(s). +defm Int_VCMPSS : sse12_cmp_scalar_int<VR128, f32mem, int_x86_sse_cmp_ss, + "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}">, + XS, VEX_4V; +defm Int_VCMPSD : sse12_cmp_scalar_int<VR128, f64mem, int_x86_sse2_cmp_sd, + "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}">, + XD, VEX_4V; +let Constraints = "$src1 = $dst" in { + defm Int_CMPSS : sse12_cmp_scalar_int<VR128, f32mem, int_x86_sse_cmp_ss, + "cmp${cc}ss\t{$src, $dst|$dst, $src}">, XS; + defm Int_CMPSD : sse12_cmp_scalar_int<VR128, f64mem, int_x86_sse2_cmp_sd, + "cmp${cc}sd\t{$src, $dst|$dst, $src}">, XD; +} + + +// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS +multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode, + ValueType vt, X86MemOperand x86memop, + PatFrag ld_frag, string OpcodeStr, Domain d> { + def rr: PI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), + [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))], d>; + def rm: PI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), + [(set EFLAGS, (OpNode (vt RC:$src1), + (ld_frag addr:$src2)))], d>; +} + +let Defs = [EFLAGS] in { + defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, + "ucomiss", SSEPackedSingle>, TB, VEX, VEX_LIG; + defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, + "ucomisd", SSEPackedDouble>, TB, OpSize, VEX, + VEX_LIG; + let Pattern = []<dag> in { + defm VCOMISS : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load, + "comiss", SSEPackedSingle>, TB, VEX, + VEX_LIG; + defm VCOMISD : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load, + "comisd", SSEPackedDouble>, TB, OpSize, VEX, + VEX_LIG; + } + + defm Int_VUCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem, + load, "ucomiss", SSEPackedSingle>, TB, VEX; + defm Int_VUCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, 
f128mem, + load, "ucomisd", SSEPackedDouble>, TB, OpSize, VEX; + + defm Int_VCOMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, + load, "comiss", SSEPackedSingle>, TB, VEX; + defm Int_VCOMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, + load, "comisd", SSEPackedDouble>, TB, OpSize, VEX; + defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, + "ucomiss", SSEPackedSingle>, TB; + defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, + "ucomisd", SSEPackedDouble>, TB, OpSize; + + let Pattern = []<dag> in { + defm COMISS : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load, + "comiss", SSEPackedSingle>, TB; + defm COMISD : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load, + "comisd", SSEPackedDouble>, TB, OpSize; + } + + defm Int_UCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem, + load, "ucomiss", SSEPackedSingle>, TB; + defm Int_UCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem, + load, "ucomisd", SSEPackedDouble>, TB, OpSize; + + defm Int_COMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, load, + "comiss", SSEPackedSingle>, TB; + defm Int_COMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, load, + "comisd", SSEPackedDouble>, TB, OpSize; +} // Defs = [EFLAGS] + +// sse12_cmp_packed - sse 1 & 2 compared packed instructions +multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop, + Intrinsic Int, string asm, string asm_alt, + Domain d> { + let isAsmParserOnly = 1 in { + def rri : PIi8<0xC2, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, RC:$src2, SSECC:$cc), asm, + [(set RC:$dst, (Int RC:$src1, RC:$src2, imm:$cc))], d>; + def rmi : PIi8<0xC2, MRMSrcMem, + (outs RC:$dst), (ins RC:$src1, f128mem:$src2, SSECC:$cc), asm, + [(set RC:$dst, (Int RC:$src1, (memop addr:$src2), imm:$cc))], d>; + } + + // Accept explicit immediate argument form instead of comparison code. 
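+  // e.g. "cmpleps %xmm1, %xmm0" and "cmpps $2, %xmm1, %xmm0" are equivalent
+  // spellings; the explicit-immediate form is what the *_alt defs below accept.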
+ def rri_alt : PIi8<0xC2, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc), + asm_alt, [], d>; + def rmi_alt : PIi8<0xC2, MRMSrcMem, + (outs RC:$dst), (ins RC:$src1, f128mem:$src2, i8imm:$cc), + asm_alt, [], d>; +} + +defm VCMPPS : sse12_cmp_packed<VR128, f128mem, int_x86_sse_cmp_ps, + "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", + SSEPackedSingle>, TB, VEX_4V; +defm VCMPPD : sse12_cmp_packed<VR128, f128mem, int_x86_sse2_cmp_pd, + "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", + SSEPackedDouble>, TB, OpSize, VEX_4V; +defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, int_x86_avx_cmp_ps_256, + "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", + SSEPackedSingle>, TB, VEX_4V; +defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, int_x86_avx_cmp_pd_256, + "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", + SSEPackedDouble>, TB, OpSize, VEX_4V; +let Constraints = "$src1 = $dst" in { + defm CMPPS : sse12_cmp_packed<VR128, f128mem, int_x86_sse_cmp_ps, + "cmp${cc}ps\t{$src2, $dst|$dst, $src2}", + "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}", + SSEPackedSingle>, TB; + defm CMPPD : sse12_cmp_packed<VR128, f128mem, int_x86_sse2_cmp_pd, + "cmp${cc}pd\t{$src2, $dst|$dst, $src2}", + "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}", + SSEPackedDouble>, TB, OpSize; +} + +let Predicates = [HasSSE1] in { +def : Pat<(v4i32 (X86cmpps (v4f32 VR128:$src1), VR128:$src2, imm:$cc)), + (CMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>; +def : Pat<(v4i32 (X86cmpps (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)), + (CMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>; +} + +let Predicates = [HasSSE2] in { +def : Pat<(v2i64 (X86cmppd (v2f64 VR128:$src1), VR128:$src2, imm:$cc)), + (CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>; +def : Pat<(v2i64 (X86cmppd (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)), + (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>; +} + +let Predicates = [HasAVX] in { +def : Pat<(v4i32 (X86cmpps (v4f32 VR128:$src1), VR128:$src2, imm:$cc)), + (VCMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>; +def : Pat<(v4i32 (X86cmpps (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)), + (VCMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>; +def : Pat<(v2i64 (X86cmppd (v2f64 VR128:$src1), VR128:$src2, imm:$cc)), + (VCMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>; +def : Pat<(v2i64 (X86cmppd (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)), + (VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>; + +def : Pat<(v8i32 (X86cmpps (v8f32 VR256:$src1), VR256:$src2, imm:$cc)), + (VCMPPSYrri (v8f32 VR256:$src1), (v8f32 VR256:$src2), imm:$cc)>; +def : Pat<(v8i32 (X86cmpps (v8f32 VR256:$src1), (memop addr:$src2), imm:$cc)), + (VCMPPSYrmi (v8f32 VR256:$src1), addr:$src2, imm:$cc)>; +def : Pat<(v4i64 (X86cmppd (v4f64 VR256:$src1), VR256:$src2, imm:$cc)), + (VCMPPDYrri VR256:$src1, VR256:$src2, imm:$cc)>; +def : Pat<(v4i64 (X86cmppd (v4f64 VR256:$src1), (memop addr:$src2), imm:$cc)), + (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>; +} + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Shuffle Instructions +//===----------------------------------------------------------------------===// + +/// sse12_shuffle - sse 1 & 2 shuffle instructions +multiclass sse12_shuffle<RegisterClass RC, 
X86MemOperand x86memop, + ValueType vt, string asm, PatFrag mem_frag, + Domain d, bit IsConvertibleToThreeAddress = 0> { + def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, f128mem:$src2, i8imm:$src3), asm, + [(set RC:$dst, (vt (shufp:$src3 + RC:$src1, (mem_frag addr:$src2))))], d>; + let isConvertibleToThreeAddress = IsConvertibleToThreeAddress in + def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, i8imm:$src3), asm, + [(set RC:$dst, + (vt (shufp:$src3 RC:$src1, RC:$src2)))], d>; +} + +defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32, + "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + memopv4f32, SSEPackedSingle>, TB, VEX_4V; +defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32, + "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + memopv8f32, SSEPackedSingle>, TB, VEX_4V; +defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64, + "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src2, $src2, $src3}", + memopv2f64, SSEPackedDouble>, TB, OpSize, VEX_4V; +defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64, + "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src2, $src2, $src3}", + memopv4f64, SSEPackedDouble>, TB, OpSize, VEX_4V; + +let Constraints = "$src1 = $dst" in { + defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32, + "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}", + memopv4f32, SSEPackedSingle, 1 /* cvt to pshufd */>, + TB; + defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64, + "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}", + memopv2f64, SSEPackedDouble>, TB, OpSize; +} + +let Predicates = [HasSSE1] in { + def : Pat<(v4f32 (X86Shufps VR128:$src1, + (memopv4f32 addr:$src2), (i8 imm:$imm))), + (SHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>; + def : Pat<(v4f32 (X86Shufps VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (SHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>; + def : Pat<(v4i32 (X86Shufps VR128:$src1, + (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))), + (SHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>; + def : Pat<(v4i32 (X86Shufps VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (SHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>; + // vector_shuffle v1, v2 <4, 5, 2, 3> using SHUFPSrri (we prefer movsd, but + // fall back to this for SSE1) + def : Pat<(v4f32 (movlp:$src3 VR128:$src1, (v4f32 VR128:$src2))), + (SHUFPSrri VR128:$src2, VR128:$src1, + (SHUFFLE_get_shuf_imm VR128:$src3))>; + // Special unary SHUFPSrri case. + def : Pat<(v4f32 (pshufd:$src3 VR128:$src1, (undef))), + (SHUFPSrri VR128:$src1, VR128:$src1, + (SHUFFLE_get_shuf_imm VR128:$src3))>; +} + +let Predicates = [HasSSE2] in { + // Special binary v4i32 shuffle cases with SHUFPS. + def : Pat<(v4i32 (shufp:$src3 VR128:$src1, (v4i32 VR128:$src2))), + (SHUFPSrri VR128:$src1, VR128:$src2, + (SHUFFLE_get_shuf_imm VR128:$src3))>; + def : Pat<(v4i32 (shufp:$src3 VR128:$src1, + (bc_v4i32 (memopv2i64 addr:$src2)))), + (SHUFPSrmi VR128:$src1, addr:$src2, + (SHUFFLE_get_shuf_imm VR128:$src3))>; + // Special unary SHUFPDrri cases. + def : Pat<(v2i64 (pshufd:$src3 VR128:$src1, (undef))), + (SHUFPDrri VR128:$src1, VR128:$src1, + (SHUFFLE_get_shuf_imm VR128:$src3))>; + def : Pat<(v2f64 (pshufd:$src3 VR128:$src1, (undef))), + (SHUFPDrri VR128:$src1, VR128:$src1, + (SHUFFLE_get_shuf_imm VR128:$src3))>; + // Special binary v2i64 shuffle cases using SHUFPDrri. 
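+  // For reference, the shufpd immediate uses one bit per destination element:
+  // bit 0 selects the low or high double of $src1 for dst[0], and bit 1 does
+  // the same with $src2 for dst[1], so for instance:
+  //   shufpd $1, %xmm1, %xmm0   # xmm0 = { xmm0[1], xmm1[0] }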
+ def : Pat<(v2i64 (shufp:$src3 VR128:$src1, VR128:$src2)), + (SHUFPDrri VR128:$src1, VR128:$src2, + (SHUFFLE_get_shuf_imm VR128:$src3))>; + // Generic SHUFPD patterns + def : Pat<(v2f64 (X86Shufps VR128:$src1, + (memopv2f64 addr:$src2), (i8 imm:$imm))), + (SHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>; + def : Pat<(v2i64 (X86Shufpd VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (SHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>; + def : Pat<(v2f64 (X86Shufpd VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (SHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>; +} + +let Predicates = [HasAVX] in { + def : Pat<(v4f32 (X86Shufps VR128:$src1, + (memopv4f32 addr:$src2), (i8 imm:$imm))), + (VSHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>; + def : Pat<(v4f32 (X86Shufps VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (VSHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>; + def : Pat<(v4i32 (X86Shufps VR128:$src1, + (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))), + (VSHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>; + def : Pat<(v4i32 (X86Shufps VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (VSHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>; + // vector_shuffle v1, v2 <4, 5, 2, 3> using SHUFPSrri (we prefer movsd, but + // fall back to this for SSE1) + def : Pat<(v4f32 (movlp:$src3 VR128:$src1, (v4f32 VR128:$src2))), + (VSHUFPSrri VR128:$src2, VR128:$src1, + (SHUFFLE_get_shuf_imm VR128:$src3))>; + // Special unary SHUFPSrri case. + def : Pat<(v4f32 (pshufd:$src3 VR128:$src1, (undef))), + (VSHUFPSrri VR128:$src1, VR128:$src1, + (SHUFFLE_get_shuf_imm VR128:$src3))>; + // Special binary v4i32 shuffle cases with SHUFPS. + def : Pat<(v4i32 (shufp:$src3 VR128:$src1, (v4i32 VR128:$src2))), + (VSHUFPSrri VR128:$src1, VR128:$src2, + (SHUFFLE_get_shuf_imm VR128:$src3))>; + def : Pat<(v4i32 (shufp:$src3 VR128:$src1, + (bc_v4i32 (memopv2i64 addr:$src2)))), + (VSHUFPSrmi VR128:$src1, addr:$src2, + (SHUFFLE_get_shuf_imm VR128:$src3))>; + // Special unary SHUFPDrri cases. + def : Pat<(v2i64 (pshufd:$src3 VR128:$src1, (undef))), + (VSHUFPDrri VR128:$src1, VR128:$src1, + (SHUFFLE_get_shuf_imm VR128:$src3))>; + def : Pat<(v2f64 (pshufd:$src3 VR128:$src1, (undef))), + (VSHUFPDrri VR128:$src1, VR128:$src1, + (SHUFFLE_get_shuf_imm VR128:$src3))>; + // Special binary v2i64 shuffle cases using SHUFPDrri. 
+ def : Pat<(v2i64 (shufp:$src3 VR128:$src1, VR128:$src2)), + (VSHUFPDrri VR128:$src1, VR128:$src2, + (SHUFFLE_get_shuf_imm VR128:$src3))>; + + def : Pat<(v2f64 (X86Shufps VR128:$src1, + (memopv2f64 addr:$src2), (i8 imm:$imm))), + (VSHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>; + def : Pat<(v2i64 (X86Shufpd VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (VSHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>; + def : Pat<(v2f64 (X86Shufpd VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (VSHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>; + + // 256-bit patterns + def : Pat<(v8i32 (X86Shufps VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VSHUFPSYrri VR256:$src1, VR256:$src2, imm:$imm)>; + def : Pat<(v8i32 (X86Shufps VR256:$src1, + (bc_v8i32 (memopv4i64 addr:$src2)), (i8 imm:$imm))), + (VSHUFPSYrmi VR256:$src1, addr:$src2, imm:$imm)>; + + def : Pat<(v8f32 (X86Shufps VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VSHUFPSYrri VR256:$src1, VR256:$src2, imm:$imm)>; + def : Pat<(v8f32 (X86Shufps VR256:$src1, + (memopv8f32 addr:$src2), (i8 imm:$imm))), + (VSHUFPSYrmi VR256:$src1, addr:$src2, imm:$imm)>; + + def : Pat<(v4i64 (X86Shufpd VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VSHUFPDYrri VR256:$src1, VR256:$src2, imm:$imm)>; + def : Pat<(v4i64 (X86Shufpd VR256:$src1, + (memopv4i64 addr:$src2), (i8 imm:$imm))), + (VSHUFPDYrmi VR256:$src1, addr:$src2, imm:$imm)>; + + def : Pat<(v4f64 (X86Shufpd VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VSHUFPDYrri VR256:$src1, VR256:$src2, imm:$imm)>; + def : Pat<(v4f64 (X86Shufpd VR256:$src1, + (memopv4f64 addr:$src2), (i8 imm:$imm))), + (VSHUFPDYrmi VR256:$src1, addr:$src2, imm:$imm)>; +} + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Unpack Instructions +//===----------------------------------------------------------------------===// + +/// sse12_unpack_interleave - sse 1 & 2 unpack and interleave +multiclass sse12_unpack_interleave<bits<8> opc, PatFrag OpNode, ValueType vt, + PatFrag mem_frag, RegisterClass RC, + X86MemOperand x86memop, string asm, + Domain d> { + def rr : PI<opc, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, RC:$src2), + asm, [(set RC:$dst, + (vt (OpNode RC:$src1, RC:$src2)))], d>; + def rm : PI<opc, MRMSrcMem, + (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + asm, [(set RC:$dst, + (vt (OpNode RC:$src1, + (mem_frag addr:$src2))))], d>; +} + +let AddedComplexity = 10 in { + defm VUNPCKHPS: sse12_unpack_interleave<0x15, unpckh, v4f32, memopv4f32, + VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedSingle>, TB, VEX_4V; + defm VUNPCKHPD: sse12_unpack_interleave<0x15, unpckh, v2f64, memopv2f64, + VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedDouble>, TB, OpSize, VEX_4V; + defm VUNPCKLPS: sse12_unpack_interleave<0x14, unpckl, v4f32, memopv4f32, + VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedSingle>, TB, VEX_4V; + defm VUNPCKLPD: sse12_unpack_interleave<0x14, unpckl, v2f64, memopv2f64, + VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedDouble>, TB, OpSize, VEX_4V; + + defm VUNPCKHPSY: sse12_unpack_interleave<0x15, unpckh, v8f32, memopv8f32, + VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedSingle>, TB, VEX_4V; + defm VUNPCKHPDY: sse12_unpack_interleave<0x15, unpckh, v4f64, memopv4f64, + VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedDouble>, TB, OpSize, VEX_4V; + defm VUNPCKLPSY: sse12_unpack_interleave<0x14, 
unpckl, v8f32, memopv8f32, + VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedSingle>, TB, VEX_4V; + defm VUNPCKLPDY: sse12_unpack_interleave<0x14, unpckl, v4f64, memopv4f64, + VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SSEPackedDouble>, TB, OpSize, VEX_4V; + + let Constraints = "$src1 = $dst" in { + defm UNPCKHPS: sse12_unpack_interleave<0x15, unpckh, v4f32, memopv4f32, + VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}", + SSEPackedSingle>, TB; + defm UNPCKHPD: sse12_unpack_interleave<0x15, unpckh, v2f64, memopv2f64, + VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}", + SSEPackedDouble>, TB, OpSize; + defm UNPCKLPS: sse12_unpack_interleave<0x14, unpckl, v4f32, memopv4f32, + VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}", + SSEPackedSingle>, TB; + defm UNPCKLPD: sse12_unpack_interleave<0x14, unpckl, v2f64, memopv2f64, + VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}", + SSEPackedDouble>, TB, OpSize; + } // Constraints = "$src1 = $dst" +} // AddedComplexity + +let Predicates = [HasSSE1] in { + def : Pat<(v4f32 (X86Unpcklps VR128:$src1, (memopv4f32 addr:$src2))), + (UNPCKLPSrm VR128:$src1, addr:$src2)>; + def : Pat<(v4f32 (X86Unpcklps VR128:$src1, VR128:$src2)), + (UNPCKLPSrr VR128:$src1, VR128:$src2)>; + def : Pat<(v4f32 (X86Unpckhps VR128:$src1, (memopv4f32 addr:$src2))), + (UNPCKHPSrm VR128:$src1, addr:$src2)>; + def : Pat<(v4f32 (X86Unpckhps VR128:$src1, VR128:$src2)), + (UNPCKHPSrr VR128:$src1, VR128:$src2)>; +} + +let Predicates = [HasSSE2] in { + def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, (memopv2f64 addr:$src2))), + (UNPCKLPDrm VR128:$src1, addr:$src2)>; + def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, VR128:$src2)), + (UNPCKLPDrr VR128:$src1, VR128:$src2)>; + def : Pat<(v2f64 (X86Unpckhpd VR128:$src1, (memopv2f64 addr:$src2))), + (UNPCKHPDrm VR128:$src1, addr:$src2)>; + def : Pat<(v2f64 (X86Unpckhpd VR128:$src1, VR128:$src2)), + (UNPCKHPDrr VR128:$src1, VR128:$src2)>; + + // FIXME: Instead of X86Movddup, there should be a X86Unpcklpd here, the + // problem is during lowering, where it's not possible to recognize the load + // fold cause it has two uses through a bitcast. One use disappears at isel + // time and the fold opportunity reappears. 
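+  // The movddup pattern below is correct because unpcklpd interleaves the low
+  // doubles of its two sources, so with both sources equal it just duplicates
+  // the low element:
+  //   unpcklpd %xmm0, %xmm0     # xmm0 = { xmm0[0], xmm0[0] }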
+ def : Pat<(v2f64 (X86Movddup VR128:$src)), + (UNPCKLPDrr VR128:$src, VR128:$src)>; + + let AddedComplexity = 10 in + def : Pat<(splat_lo (v2f64 VR128:$src), (undef)), + (UNPCKLPDrr VR128:$src, VR128:$src)>; +} + +let Predicates = [HasAVX] in { + def : Pat<(v4f32 (X86Unpcklps VR128:$src1, (memopv4f32 addr:$src2))), + (VUNPCKLPSrm VR128:$src1, addr:$src2)>; + def : Pat<(v4f32 (X86Unpcklps VR128:$src1, VR128:$src2)), + (VUNPCKLPSrr VR128:$src1, VR128:$src2)>; + def : Pat<(v4f32 (X86Unpckhps VR128:$src1, (memopv4f32 addr:$src2))), + (VUNPCKHPSrm VR128:$src1, addr:$src2)>; + def : Pat<(v4f32 (X86Unpckhps VR128:$src1, VR128:$src2)), + (VUNPCKHPSrr VR128:$src1, VR128:$src2)>; + + def : Pat<(v8f32 (X86Unpcklpsy VR256:$src1, (memopv8f32 addr:$src2))), + (VUNPCKLPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(v8f32 (X86Unpcklpsy VR256:$src1, VR256:$src2)), + (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v8i32 (X86Unpcklpsy VR256:$src1, VR256:$src2)), + (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v8i32 (X86Unpcklpsy VR256:$src1, (memopv8i32 addr:$src2))), + (VUNPCKLPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(v8f32 (X86Unpckhpsy VR256:$src1, (memopv8f32 addr:$src2))), + (VUNPCKHPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(v8f32 (X86Unpckhpsy VR256:$src1, VR256:$src2)), + (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v8i32 (X86Unpckhpsy VR256:$src1, (memopv8i32 addr:$src2))), + (VUNPCKHPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(v8i32 (X86Unpckhpsy VR256:$src1, VR256:$src2)), + (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>; + + def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, (memopv2f64 addr:$src2))), + (VUNPCKLPDrm VR128:$src1, addr:$src2)>; + def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, VR128:$src2)), + (VUNPCKLPDrr VR128:$src1, VR128:$src2)>; + def : Pat<(v2f64 (X86Unpckhpd VR128:$src1, (memopv2f64 addr:$src2))), + (VUNPCKHPDrm VR128:$src1, addr:$src2)>; + def : Pat<(v2f64 (X86Unpckhpd VR128:$src1, VR128:$src2)), + (VUNPCKHPDrr VR128:$src1, VR128:$src2)>; + + def : Pat<(v4f64 (X86Unpcklpdy VR256:$src1, (memopv4f64 addr:$src2))), + (VUNPCKLPDYrm VR256:$src1, addr:$src2)>; + def : Pat<(v4f64 (X86Unpcklpdy VR256:$src1, VR256:$src2)), + (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v4i64 (X86Unpcklpdy VR256:$src1, (memopv4i64 addr:$src2))), + (VUNPCKLPDYrm VR256:$src1, addr:$src2)>; + def : Pat<(v4i64 (X86Unpcklpdy VR256:$src1, VR256:$src2)), + (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v4f64 (X86Unpckhpdy VR256:$src1, (memopv4f64 addr:$src2))), + (VUNPCKHPDYrm VR256:$src1, addr:$src2)>; + def : Pat<(v4f64 (X86Unpckhpdy VR256:$src1, VR256:$src2)), + (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v4i64 (X86Unpckhpdy VR256:$src1, (memopv4i64 addr:$src2))), + (VUNPCKHPDYrm VR256:$src1, addr:$src2)>; + def : Pat<(v4i64 (X86Unpckhpdy VR256:$src1, VR256:$src2)), + (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>; + + // FIXME: Instead of X86Movddup, there should be a X86Unpcklpd here, the + // problem is during lowering, where it's not possible to recognize the load + // fold cause it has two uses through a bitcast. One use disappears at isel + // time and the fold opportunity reappears. 
+ def : Pat<(v2f64 (X86Movddup VR128:$src)), + (VUNPCKLPDrr VR128:$src, VR128:$src)>; + let AddedComplexity = 10 in + def : Pat<(splat_lo (v2f64 VR128:$src), (undef)), + (VUNPCKLPDrr VR128:$src, VR128:$src)>; +} + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Extract Floating-Point Sign mask +//===----------------------------------------------------------------------===// + +/// sse12_extr_sign_mask - sse 1 & 2 unpack and interleave +multiclass sse12_extr_sign_mask<RegisterClass RC, Intrinsic Int, string asm, + Domain d> { + def rr32 : PI<0x50, MRMSrcReg, (outs GR32:$dst), (ins RC:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(set GR32:$dst, (Int RC:$src))], d>; + def rr64 : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins RC:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], d>, REX_W; +} + +defm MOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, "movmskps", + SSEPackedSingle>, TB; +defm MOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, "movmskpd", + SSEPackedDouble>, TB, OpSize; + +def : Pat<(i32 (X86fgetsign FR32:$src)), + (MOVMSKPSrr32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, + sub_ss))>, Requires<[HasSSE1]>; +def : Pat<(i64 (X86fgetsign FR32:$src)), + (MOVMSKPSrr64 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, + sub_ss))>, Requires<[HasSSE1]>; +def : Pat<(i32 (X86fgetsign FR64:$src)), + (MOVMSKPDrr32 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src, + sub_sd))>, Requires<[HasSSE2]>; +def : Pat<(i64 (X86fgetsign FR64:$src)), + (MOVMSKPDrr64 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src, + sub_sd))>, Requires<[HasSSE2]>; + +let Predicates = [HasAVX] in { + defm VMOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, + "movmskps", SSEPackedSingle>, TB, VEX; + defm VMOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, + "movmskpd", SSEPackedDouble>, TB, + OpSize, VEX; + defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_ps_256, + "movmskps", SSEPackedSingle>, TB, VEX; + defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_pd_256, + "movmskpd", SSEPackedDouble>, TB, + OpSize, VEX; + + def : Pat<(i32 (X86fgetsign FR32:$src)), + (VMOVMSKPSrr32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, + sub_ss))>; + def : Pat<(i64 (X86fgetsign FR32:$src)), + (VMOVMSKPSrr64 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, + sub_ss))>; + def : Pat<(i32 (X86fgetsign FR64:$src)), + (VMOVMSKPDrr32 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src, + sub_sd))>; + def : Pat<(i64 (X86fgetsign FR64:$src)), + (VMOVMSKPDrr64 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src, + sub_sd))>; + + // Assembler Only + def VMOVMSKPSr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src), + "movmskps\t{$src, $dst|$dst, $src}", [], SSEPackedSingle>, TB, VEX; + def VMOVMSKPDr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src), + "movmskpd\t{$src, $dst|$dst, $src}", [], SSEPackedDouble>, TB, + OpSize, VEX; + def VMOVMSKPSYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src), + "movmskps\t{$src, $dst|$dst, $src}", [], SSEPackedSingle>, TB, VEX; + def VMOVMSKPDYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src), + "movmskpd\t{$src, $dst|$dst, $src}", [], SSEPackedDouble>, TB, + OpSize, VEX; +} + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Logical Instructions +//===----------------------------------------------------------------------===// + +/// sse12_fp_alias_pack_logical - SSE 1 & 2 
aliased packed FP logical ops +/// +multiclass sse12_fp_alias_pack_logical<bits<8> opc, string OpcodeStr, + SDNode OpNode> { + defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, + FR32, f32, f128mem, memopfsf32, SSEPackedSingle, 0>, TB, VEX_4V; + + defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, + FR64, f64, f128mem, memopfsf64, SSEPackedDouble, 0>, TB, OpSize, VEX_4V; + + let Constraints = "$src1 = $dst" in { + defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, FR32, + f32, f128mem, memopfsf32, SSEPackedSingle>, TB; + + defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, FR64, + f64, f128mem, memopfsf64, SSEPackedDouble>, TB, OpSize; + } +} + +// Alias bitwise logical operations using SSE logical ops on packed FP values. +let mayLoad = 0 in { + defm FsAND : sse12_fp_alias_pack_logical<0x54, "and", X86fand>; + defm FsOR : sse12_fp_alias_pack_logical<0x56, "or", X86for>; + defm FsXOR : sse12_fp_alias_pack_logical<0x57, "xor", X86fxor>; +} + +let neverHasSideEffects = 1, Pattern = []<dag>, isCommutable = 0 in + defm FsANDN : sse12_fp_alias_pack_logical<0x55, "andn", undef>; + +/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops +/// +multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr, + SDNode OpNode> { + // In AVX no need to add a pattern for 128-bit logical rr ps, because they + // are all promoted to v2i64, and the patterns are covered by the int + // version. This is needed in SSE only, because v2i64 isn't supported on + // SSE1, but only on SSE2. + defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle, + !strconcat(OpcodeStr, "ps"), f128mem, [], + [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)), + (memopv2i64 addr:$src2)))], 0>, TB, VEX_4V; + + defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble, + !strconcat(OpcodeStr, "pd"), f128mem, + [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)), + (bc_v2i64 (v2f64 VR128:$src2))))], + [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)), + (memopv2i64 addr:$src2)))], 0>, + TB, OpSize, VEX_4V; + let Constraints = "$src1 = $dst" in { + defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle, + !strconcat(OpcodeStr, "ps"), f128mem, + [(set VR128:$dst, (v2i64 (OpNode VR128:$src1, VR128:$src2)))], + [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)), + (memopv2i64 addr:$src2)))]>, TB; + + defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble, + !strconcat(OpcodeStr, "pd"), f128mem, + [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)), + (bc_v2i64 (v2f64 VR128:$src2))))], + [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)), + (memopv2i64 addr:$src2)))]>, TB, OpSize; + } +} + +/// sse12_fp_packed_logical_y - AVX 256-bit SSE 1 & 2 logical ops forms +/// +multiclass sse12_fp_packed_logical_y<bits<8> opc, string OpcodeStr, + SDNode OpNode> { + defm PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle, + !strconcat(OpcodeStr, "ps"), f256mem, + [(set VR256:$dst, (v4i64 (OpNode VR256:$src1, VR256:$src2)))], + [(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)), + (memopv4i64 addr:$src2)))], 0>, TB, VEX_4V; + + defm PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble, + !strconcat(OpcodeStr, "pd"), f256mem, + [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)), + (bc_v4i64 (v4f64 VR256:$src2))))], + [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)), + (memopv4i64 addr:$src2)))], 0>, + TB, OpSize, VEX_4V; +} + +// AVX 256-bit 
packed logical ops forms +defm VAND : sse12_fp_packed_logical_y<0x54, "and", and>; +defm VOR : sse12_fp_packed_logical_y<0x56, "or", or>; +defm VXOR : sse12_fp_packed_logical_y<0x57, "xor", xor>; +defm VANDN : sse12_fp_packed_logical_y<0x55, "andn", X86andnp>; + +defm AND : sse12_fp_packed_logical<0x54, "and", and>; +defm OR : sse12_fp_packed_logical<0x56, "or", or>; +defm XOR : sse12_fp_packed_logical<0x57, "xor", xor>; +let isCommutable = 0 in + defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp>; + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Arithmetic Instructions +//===----------------------------------------------------------------------===// + +/// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and +/// vector forms. +/// +/// In addition, we also have a special variant of the scalar form here to +/// represent the associated intrinsic operation. This form is unlike the +/// plain scalar form, in that it takes an entire vector (instead of a scalar) +/// and leaves the top elements unmodified (therefore these cannot be commuted). +/// +/// These three forms can each be reg+reg or reg+mem. +/// + +/// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those +/// classes below +multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, + bit Is2Addr = 1> { + defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"), + OpNode, FR32, f32mem, Is2Addr>, XS; + defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"), + OpNode, FR64, f64mem, Is2Addr>, XD; +} + +multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, + bit Is2Addr = 1> { + let mayLoad = 0 in { + defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128, + v4f32, f128mem, memopv4f32, SSEPackedSingle, Is2Addr>, TB; + defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128, + v2f64, f128mem, memopv2f64, SSEPackedDouble, Is2Addr>, TB, OpSize; + } +} + +multiclass basic_sse12_fp_binop_p_y<bits<8> opc, string OpcodeStr, + SDNode OpNode> { + let mayLoad = 0 in { + defm PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR256, + v8f32, f256mem, memopv8f32, SSEPackedSingle, 0>, TB; + defm PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR256, + v4f64, f256mem, memopv4f64, SSEPackedDouble, 0>, TB, OpSize; + } +} + +multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr, + bit Is2Addr = 1> { + defm SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128, + !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32, Is2Addr>, XS; + defm SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128, + !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64, Is2Addr>, XD; +} + +multiclass basic_sse12_fp_binop_p_int<bits<8> opc, string OpcodeStr, + bit Is2Addr = 1> { + defm PS : sse12_fp_packed_int<opc, OpcodeStr, VR128, + !strconcat(OpcodeStr, "ps"), "sse", "_ps", f128mem, memopv4f32, + SSEPackedSingle, Is2Addr>, TB; + + defm PD : sse12_fp_packed_int<opc, OpcodeStr, VR128, + !strconcat(OpcodeStr, "pd"), "sse2", "_pd", f128mem, memopv2f64, + SSEPackedDouble, Is2Addr>, TB, OpSize; +} + +multiclass basic_sse12_fp_binop_p_y_int<bits<8> opc, string OpcodeStr> { + defm PSY : sse12_fp_packed_int<opc, OpcodeStr, VR256, + !strconcat(OpcodeStr, "ps"), "avx", "_ps_256", f256mem, memopv8f32, + SSEPackedSingle, 0>, TB; + + defm PDY : sse12_fp_packed_int<opc, OpcodeStr, VR256, + !strconcat(OpcodeStr, "pd"), "avx", "_pd_256", f256mem, memopv4f64, + 
SSEPackedDouble, 0>, TB, OpSize; +} + +// Binary Arithmetic instructions +defm VADD : basic_sse12_fp_binop_s<0x58, "add", fadd, 0>, + basic_sse12_fp_binop_s_int<0x58, "add", 0>, VEX_4V, VEX_LIG; +defm VADD : basic_sse12_fp_binop_p<0x58, "add", fadd, 0>, + basic_sse12_fp_binop_p_y<0x58, "add", fadd>, VEX_4V; +defm VMUL : basic_sse12_fp_binop_s<0x59, "mul", fmul, 0>, + basic_sse12_fp_binop_s_int<0x59, "mul", 0>, VEX_4V, VEX_LIG; +defm VMUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, 0>, + basic_sse12_fp_binop_p_y<0x59, "mul", fmul>, VEX_4V; + +let isCommutable = 0 in { + defm VSUB : basic_sse12_fp_binop_s<0x5C, "sub", fsub, 0>, + basic_sse12_fp_binop_s_int<0x5C, "sub", 0>, VEX_4V, VEX_LIG; + defm VSUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, 0>, + basic_sse12_fp_binop_p_y<0x5C, "sub", fsub>, VEX_4V; + defm VDIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv, 0>, + basic_sse12_fp_binop_s_int<0x5E, "div", 0>, VEX_4V, VEX_LIG; + defm VDIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, 0>, + basic_sse12_fp_binop_p_y<0x5E, "div", fdiv>, VEX_4V; + defm VMAX : basic_sse12_fp_binop_s<0x5F, "max", X86fmax, 0>, + basic_sse12_fp_binop_s_int<0x5F, "max", 0>, VEX_4V, VEX_LIG; + defm VMAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, 0>, + basic_sse12_fp_binop_p_int<0x5F, "max", 0>, + basic_sse12_fp_binop_p_y<0x5F, "max", X86fmax>, + basic_sse12_fp_binop_p_y_int<0x5F, "max">, VEX_4V; + defm VMIN : basic_sse12_fp_binop_s<0x5D, "min", X86fmin, 0>, + basic_sse12_fp_binop_s_int<0x5D, "min", 0>, VEX_4V, VEX_LIG; + defm VMIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, 0>, + basic_sse12_fp_binop_p_int<0x5D, "min", 0>, + basic_sse12_fp_binop_p_y_int<0x5D, "min">, + basic_sse12_fp_binop_p_y<0x5D, "min", X86fmin>, VEX_4V; +} + +let Constraints = "$src1 = $dst" in { + defm ADD : basic_sse12_fp_binop_s<0x58, "add", fadd>, + basic_sse12_fp_binop_p<0x58, "add", fadd>, + basic_sse12_fp_binop_s_int<0x58, "add">; + defm MUL : basic_sse12_fp_binop_s<0x59, "mul", fmul>, + basic_sse12_fp_binop_p<0x59, "mul", fmul>, + basic_sse12_fp_binop_s_int<0x59, "mul">; + + let isCommutable = 0 in { + defm SUB : basic_sse12_fp_binop_s<0x5C, "sub", fsub>, + basic_sse12_fp_binop_p<0x5C, "sub", fsub>, + basic_sse12_fp_binop_s_int<0x5C, "sub">; + defm DIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv>, + basic_sse12_fp_binop_p<0x5E, "div", fdiv>, + basic_sse12_fp_binop_s_int<0x5E, "div">; + defm MAX : basic_sse12_fp_binop_s<0x5F, "max", X86fmax>, + basic_sse12_fp_binop_p<0x5F, "max", X86fmax>, + basic_sse12_fp_binop_s_int<0x5F, "max">, + basic_sse12_fp_binop_p_int<0x5F, "max">; + defm MIN : basic_sse12_fp_binop_s<0x5D, "min", X86fmin>, + basic_sse12_fp_binop_p<0x5D, "min", X86fmin>, + basic_sse12_fp_binop_s_int<0x5D, "min">, + basic_sse12_fp_binop_p_int<0x5D, "min">; + } +} + +/// Unop Arithmetic +/// In addition, we also have a special variant of the scalar form here to +/// represent the associated intrinsic operation. This form is unlike the +/// plain scalar form, in that it takes an entire vector (instead of a +/// scalar) and leaves the top elements undefined. +/// +/// And, we have a special variant form for a full-vector intrinsic form. + +/// sse1_fp_unop_s - SSE1 unops in scalar form. +multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, + SDNode OpNode, Intrinsic F32Int> { + def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src), + !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"), + [(set FR32:$dst, (OpNode FR32:$src))]>; + // For scalar unary operations, fold a load into the operation + // only in OptForSize mode. 
It eliminates an instruction, but it also + // eliminates a whole-register clobber (the load), so it introduces a + // partial register update condition. + def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src), + !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"), + [(set FR32:$dst, (OpNode (load addr:$src)))]>, XS, + Requires<[HasSSE1, OptForSize]>; + def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (F32Int VR128:$src))]>; + def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), (ins ssmem:$src), + !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (F32Int sse_load_f32:$src))]>; +} + +/// sse1_fp_unop_s_avx - AVX SSE1 unops in scalar form. +multiclass sse1_fp_unop_s_avx<bits<8> opc, string OpcodeStr> { + def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src1, FR32:$src2), + !strconcat(OpcodeStr, + "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; + let mayLoad = 1 in + def SSm : SSI<opc, MRMSrcMem, (outs FR32:$dst), (ins FR32:$src1,f32mem:$src2), + !strconcat(OpcodeStr, + "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; + def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), + (ins ssmem:$src1, VR128:$src2), + !strconcat(OpcodeStr, + "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; +} + +/// sse1_fp_unop_p - SSE1 unops in packed form. +multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode> { + def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>; + def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))]>; +} + +/// sse1_fp_unop_p_y - AVX 256-bit SSE1 unops in packed form. +multiclass sse1_fp_unop_p_y<bits<8> opc, string OpcodeStr, SDNode OpNode> { + def PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))]>; + def PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), + !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (OpNode (memopv8f32 addr:$src)))]>; +} + +/// sse1_fp_unop_p_int - SSE1 intrinsics unops in packed forms. +multiclass sse1_fp_unop_p_int<bits<8> opc, string OpcodeStr, + Intrinsic V4F32Int> { + def PSr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (V4F32Int VR128:$src))]>; + def PSm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (V4F32Int (memopv4f32 addr:$src)))]>; +} + +/// sse1_fp_unop_p_y_int - AVX 256-bit intrinsics unops in packed forms. +multiclass sse1_fp_unop_p_y_int<bits<8> opc, string OpcodeStr, + Intrinsic V4F32Int> { + def PSYr_Int : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (V4F32Int VR256:$src))]>; + def PSYm_Int : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), + !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (V4F32Int (memopv8f32 addr:$src)))]>; +} + +/// sse2_fp_unop_s - SSE2 unops in scalar form. 
+multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, + SDNode OpNode, Intrinsic F64Int> { + def SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src), + !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"), + [(set FR64:$dst, (OpNode FR64:$src))]>; + // See the comments in sse1_fp_unop_s for why this is OptForSize. + def SDm : I<opc, MRMSrcMem, (outs FR64:$dst), (ins f64mem:$src), + !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"), + [(set FR64:$dst, (OpNode (load addr:$src)))]>, XD, + Requires<[HasSSE2, OptForSize]>; + def SDr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (F64Int VR128:$src))]>; + def SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst), (ins sdmem:$src), + !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (F64Int sse_load_f64:$src))]>; +} + +/// sse2_fp_unop_s_avx - AVX SSE2 unops in scalar form. +multiclass sse2_fp_unop_s_avx<bits<8> opc, string OpcodeStr> { + def SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR64:$src2), + !strconcat(OpcodeStr, + "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; + def SDm : SDI<opc, MRMSrcMem, (outs FR64:$dst), (ins FR64:$src1,f64mem:$src2), + !strconcat(OpcodeStr, + "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; + def SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, sdmem:$src2), + !strconcat(OpcodeStr, + "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; +} + +/// sse2_fp_unop_p - SSE2 unops in vector forms. +multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr, + SDNode OpNode> { + def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>; + def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))]>; +} + +/// sse2_fp_unop_p_y - AVX SSE2 256-bit unops in vector forms. +multiclass sse2_fp_unop_p_y<bits<8> opc, string OpcodeStr, SDNode OpNode> { + def PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))]>; + def PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), + !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (OpNode (memopv4f64 addr:$src)))]>; +} + +/// sse2_fp_unop_p_int - SSE2 intrinsic unops in vector forms. +multiclass sse2_fp_unop_p_int<bits<8> opc, string OpcodeStr, + Intrinsic V2F64Int> { + def PDr_Int : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (V2F64Int VR128:$src))]>; + def PDm_Int : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (V2F64Int (memopv2f64 addr:$src)))]>; +} + +/// sse2_fp_unop_p_y_int - AVX 256-bit intrinsic unops in vector forms. 
+multiclass sse2_fp_unop_p_y_int<bits<8> opc, string OpcodeStr, + Intrinsic V2F64Int> { + def PDYr_Int : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (V2F64Int VR256:$src))]>; + def PDYm_Int : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), + !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (V2F64Int (memopv4f64 addr:$src)))]>; +} + +let Predicates = [HasAVX] in { + // Square root. + defm VSQRT : sse1_fp_unop_s_avx<0x51, "vsqrt">, + sse2_fp_unop_s_avx<0x51, "vsqrt">, VEX_4V, VEX_LIG; + + defm VSQRT : sse1_fp_unop_p<0x51, "vsqrt", fsqrt>, + sse2_fp_unop_p<0x51, "vsqrt", fsqrt>, + sse1_fp_unop_p_y<0x51, "vsqrt", fsqrt>, + sse2_fp_unop_p_y<0x51, "vsqrt", fsqrt>, + sse1_fp_unop_p_int<0x51, "vsqrt", int_x86_sse_sqrt_ps>, + sse2_fp_unop_p_int<0x51, "vsqrt", int_x86_sse2_sqrt_pd>, + sse1_fp_unop_p_y_int<0x51, "vsqrt", int_x86_avx_sqrt_ps_256>, + sse2_fp_unop_p_y_int<0x51, "vsqrt", int_x86_avx_sqrt_pd_256>, + VEX; + + // Reciprocal approximations. Note that these typically require refinement + // in order to obtain suitable precision. + defm VRSQRT : sse1_fp_unop_s_avx<0x52, "vrsqrt">, VEX_4V, VEX_LIG; + defm VRSQRT : sse1_fp_unop_p<0x52, "vrsqrt", X86frsqrt>, + sse1_fp_unop_p_y<0x52, "vrsqrt", X86frsqrt>, + sse1_fp_unop_p_y_int<0x52, "vrsqrt", int_x86_avx_rsqrt_ps_256>, + sse1_fp_unop_p_int<0x52, "vrsqrt", int_x86_sse_rsqrt_ps>, VEX; + + defm VRCP : sse1_fp_unop_s_avx<0x53, "vrcp">, VEX_4V, VEX_LIG; + defm VRCP : sse1_fp_unop_p<0x53, "vrcp", X86frcp>, + sse1_fp_unop_p_y<0x53, "vrcp", X86frcp>, + sse1_fp_unop_p_y_int<0x53, "vrcp", int_x86_avx_rcp_ps_256>, + sse1_fp_unop_p_int<0x53, "vrcp", int_x86_sse_rcp_ps>, VEX; +} + +def : Pat<(f32 (fsqrt FR32:$src)), + (VSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>; +def : Pat<(f32 (fsqrt (load addr:$src))), + (VSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>, + Requires<[HasAVX, OptForSize]>; +def : Pat<(f64 (fsqrt FR64:$src)), + (VSQRTSDr (f64 (IMPLICIT_DEF)), FR64:$src)>, Requires<[HasAVX]>; +def : Pat<(f64 (fsqrt (load addr:$src))), + (VSQRTSDm (f64 (IMPLICIT_DEF)), addr:$src)>, + Requires<[HasAVX, OptForSize]>; + +def : Pat<(f32 (X86frsqrt FR32:$src)), + (VRSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>; +def : Pat<(f32 (X86frsqrt (load addr:$src))), + (VRSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>, + Requires<[HasAVX, OptForSize]>; + +def : Pat<(f32 (X86frcp FR32:$src)), + (VRCPSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>; +def : Pat<(f32 (X86frcp (load addr:$src))), + (VRCPSSm (f32 (IMPLICIT_DEF)), addr:$src)>, + Requires<[HasAVX, OptForSize]>; + +let Predicates = [HasAVX] in { + def : Pat<(int_x86_sse_sqrt_ss VR128:$src), + (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), + (VSQRTSSr (f32 (IMPLICIT_DEF)), + (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss)), + sub_ss)>; + def : Pat<(int_x86_sse_sqrt_ss sse_load_f32:$src), + (VSQRTSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>; + + def : Pat<(int_x86_sse2_sqrt_sd VR128:$src), + (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), + (VSQRTSDr (f64 (IMPLICIT_DEF)), + (EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd)), + sub_sd)>; + def : Pat<(int_x86_sse2_sqrt_sd sse_load_f64:$src), + (VSQRTSDm_Int (v2f64 (IMPLICIT_DEF)), sse_load_f64:$src)>; + + def : Pat<(int_x86_sse_rsqrt_ss VR128:$src), + (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), + (VRSQRTSSr (f32 (IMPLICIT_DEF)), + (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss)), + sub_ss)>; + def : Pat<(int_x86_sse_rsqrt_ss sse_load_f32:$src), 
+ (VRSQRTSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>; + + def : Pat<(int_x86_sse_rcp_ss VR128:$src), + (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), + (VRCPSSr (f32 (IMPLICIT_DEF)), + (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss)), + sub_ss)>; + def : Pat<(int_x86_sse_rcp_ss sse_load_f32:$src), + (VRCPSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>; +} + +// Square root. +defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse_sqrt_ss>, + sse1_fp_unop_p<0x51, "sqrt", fsqrt>, + sse1_fp_unop_p_int<0x51, "sqrt", int_x86_sse_sqrt_ps>, + sse2_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse2_sqrt_sd>, + sse2_fp_unop_p<0x51, "sqrt", fsqrt>, + sse2_fp_unop_p_int<0x51, "sqrt", int_x86_sse2_sqrt_pd>; + +// Reciprocal approximations. Note that these typically require refinement +// in order to obtain suitable precision. +defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, int_x86_sse_rsqrt_ss>, + sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt>, + sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps>; +defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, int_x86_sse_rcp_ss>, + sse1_fp_unop_p<0x53, "rcp", X86frcp>, + sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps>; + +// There is no f64 version of the reciprocal approximation instructions. + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Non-temporal stores +//===----------------------------------------------------------------------===// + +let AddedComplexity = 400 in { // Prefer non-temporal versions + def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs), + (ins f128mem:$dst, VR128:$src), + "movntps\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v4f32 VR128:$src), + addr:$dst)]>, VEX; + def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs), + (ins f128mem:$dst, VR128:$src), + "movntpd\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v2f64 VR128:$src), + addr:$dst)]>, VEX; + def VMOVNTDQ_64mr : VPDI<0xE7, MRMDestMem, (outs), + (ins f128mem:$dst, VR128:$src), + "movntdq\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v2f64 VR128:$src), + addr:$dst)]>, VEX; + + let ExeDomain = SSEPackedInt in + def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs), + (ins f128mem:$dst, VR128:$src), + "movntdq\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v4f32 VR128:$src), + addr:$dst)]>, VEX; + + def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst), + (VMOVNTDQmr addr:$dst, VR128:$src)>, Requires<[HasAVX]>; + + def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs), + (ins f256mem:$dst, VR256:$src), + "movntps\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v8f32 VR256:$src), + addr:$dst)]>, VEX; + def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs), + (ins f256mem:$dst, VR256:$src), + "movntpd\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v4f64 VR256:$src), + addr:$dst)]>, VEX; + def VMOVNTDQY_64mr : VPDI<0xE7, MRMDestMem, (outs), + (ins f256mem:$dst, VR256:$src), + "movntdq\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v4f64 VR256:$src), + addr:$dst)]>, VEX; + let ExeDomain = SSEPackedInt in + def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs), + (ins f256mem:$dst, VR256:$src), + "movntdq\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v8f32 VR256:$src), + addr:$dst)]>, VEX; +} + +def : Pat<(int_x86_avx_movnt_dq_256 addr:$dst, VR256:$src), + (VMOVNTDQYmr addr:$dst, VR256:$src)>; +def : Pat<(int_x86_avx_movnt_pd_256 addr:$dst, VR256:$src), + (VMOVNTPDYmr addr:$dst, VR256:$src)>; +def : Pat<(int_x86_avx_movnt_ps_256 addr:$dst, VR256:$src), + (VMOVNTPSYmr addr:$dst, 
VR256:$src)>; + +let AddedComplexity = 400 in { // Prefer non-temporal versions +def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movntps\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>; +def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movntpd\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)]>; + +def MOVNTDQ_64mr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movntdq\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v2f64 VR128:$src), addr:$dst)]>; + +let ExeDomain = SSEPackedInt in +def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movntdq\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>; + +def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst), + (MOVNTDQmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>; + +// There is no AVX form for instructions below this point +def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), + "movnti{l}\t{$src, $dst|$dst, $src}", + [(nontemporalstore (i32 GR32:$src), addr:$dst)]>, + TB, Requires<[HasSSE2]>; +def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), + "movnti{q}\t{$src, $dst|$dst, $src}", + [(nontemporalstore (i64 GR64:$src), addr:$dst)]>, + TB, Requires<[HasSSE2]>; +} + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Prefetch and memory fence +//===----------------------------------------------------------------------===// + +// Prefetch intrinsic. +def PREFETCHT0 : PSI<0x18, MRM1m, (outs), (ins i8mem:$src), + "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))]>; +def PREFETCHT1 : PSI<0x18, MRM2m, (outs), (ins i8mem:$src), + "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))]>; +def PREFETCHT2 : PSI<0x18, MRM3m, (outs), (ins i8mem:$src), + "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))]>; +def PREFETCHNTA : PSI<0x18, MRM0m, (outs), (ins i8mem:$src), + "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))]>; + +// Flush cache +def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src), + "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>, + TB, Requires<[HasSSE2]>; + +// Pause. This "instruction" is encoded as "rep; nop", so even though it +// was introduced with SSE2, it's backward compatible. 
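+// (It encodes as F3 90, the REP prefix on NOP; pre-SSE2 processors ignore the
+// prefix and execute a plain NOP, so spin-wait loops remain portable.)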
+def PAUSE : I<0x90, RawFrm, (outs), (ins), "pause", []>, REP; + +// Load, store, and memory fence +def SFENCE : I<0xAE, MRM_F8, (outs), (ins), + "sfence", [(int_x86_sse_sfence)]>, TB, Requires<[HasSSE1]>; +def LFENCE : I<0xAE, MRM_E8, (outs), (ins), + "lfence", [(int_x86_sse2_lfence)]>, TB, Requires<[HasSSE2]>; +def MFENCE : I<0xAE, MRM_F0, (outs), (ins), + "mfence", [(int_x86_sse2_mfence)]>, TB, Requires<[HasSSE2]>; + +def : Pat<(X86SFence), (SFENCE)>; +def : Pat<(X86LFence), (LFENCE)>; +def : Pat<(X86MFence), (MFENCE)>; + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Load/Store XCSR register +//===----------------------------------------------------------------------===// + +def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src), + "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>, VEX; +def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst), + "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>, VEX; + +def LDMXCSR : PSI<0xAE, MRM2m, (outs), (ins i32mem:$src), + "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>; +def STMXCSR : PSI<0xAE, MRM3m, (outs), (ins i32mem:$dst), + "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>; + +//===---------------------------------------------------------------------===// +// SSE2 - Move Aligned/Unaligned Packed Integer Instructions +//===---------------------------------------------------------------------===// + +let ExeDomain = SSEPackedInt in { // SSE integer instructions + +let neverHasSideEffects = 1 in { +def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; +def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; +} +def VMOVDQUrr : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "movdqu\t{$src, $dst|$dst, $src}", []>, VEX; +def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + "movdqu\t{$src, $dst|$dst, $src}", []>, VEX; + +// For Disassembler +let isCodeGenOnly = 1 in { +def VMOVDQArr_REV : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; +def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; +def VMOVDQUrr_REV : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), + "movdqu\t{$src, $dst|$dst, $src}", []>, VEX; +def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), + "movdqu\t{$src, $dst|$dst, $src}", []>, VEX; +} + +let canFoldAsLoad = 1, mayLoad = 1 in { +def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; +def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; +let Predicates = [HasAVX] in { + def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX; + def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), + "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX; +} +} + +let mayStore = 1 in { +def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs), + (ins i128mem:$dst, VR128:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; +def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs), + (ins i256mem:$dst, VR256:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; +let Predicates = [HasAVX] in { +def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins 
i128mem:$dst, VR128:$src), + "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX; +def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src), + "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX; +} +} + +let neverHasSideEffects = 1 in +def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>; + +def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "movdqu\t{$src, $dst|$dst, $src}", + []>, XS, Requires<[HasSSE2]>; + +// For Disassembler +let isCodeGenOnly = 1 in { +def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>; + +def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), + "movdqu\t{$src, $dst|$dst, $src}", + []>, XS, Requires<[HasSSE2]>; +} + +let canFoldAsLoad = 1, mayLoad = 1 in { +def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "movdqa\t{$src, $dst|$dst, $src}", + [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/]>; +def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "movdqu\t{$src, $dst|$dst, $src}", + [/*(set VR128:$dst, (loadv2i64 addr:$src))*/]>, + XS, Requires<[HasSSE2]>; +} + +let mayStore = 1 in { +def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), + "movdqa\t{$src, $dst|$dst, $src}", + [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/]>; +def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), + "movdqu\t{$src, $dst|$dst, $src}", + [/*(store (v2i64 VR128:$src), addr:$dst)*/]>, + XS, Requires<[HasSSE2]>; +} + +// Intrinsic forms of MOVDQU load and store +def VMOVDQUmr_Int : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), + "vmovdqu\t{$src, $dst|$dst, $src}", + [(int_x86_sse2_storeu_dq addr:$dst, VR128:$src)]>, + XS, VEX, Requires<[HasAVX]>; + +def MOVDQUmr_Int : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), + "movdqu\t{$src, $dst|$dst, $src}", + [(int_x86_sse2_storeu_dq addr:$dst, VR128:$src)]>, + XS, Requires<[HasSSE2]>; + +} // ExeDomain = SSEPackedInt + +let Predicates = [HasAVX] in { + def : Pat<(int_x86_avx_loadu_dq_256 addr:$src), (VMOVDQUYrm addr:$src)>; + def : Pat<(int_x86_avx_storeu_dq_256 addr:$dst, VR256:$src), + (VMOVDQUYmr addr:$dst, VR256:$src)>; +} + +//===---------------------------------------------------------------------===// +// SSE2 - Packed Integer Arithmetic Instructions +//===---------------------------------------------------------------------===// + +let ExeDomain = SSEPackedInt in { // SSE integer instructions + +multiclass PDI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId, + bit IsCommutable = 0, bit Is2Addr = 1> { + let isCommutable = IsCommutable in + def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2))]>; + def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, (IntId VR128:$src1, + (bitconvert (memopv2i64 addr:$src2))))]>; +} + +multiclass PDI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm, + string OpcodeStr, Intrinsic IntId, + Intrinsic IntId2, bit Is2Addr = 1> { + def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst), + 
(ins VR128:$src1, VR128:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2))]>; + def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, (IntId VR128:$src1, + (bitconvert (memopv2i64 addr:$src2))))]>; + def ri : PDIi8<opc2, ImmForm, (outs VR128:$dst), + (ins VR128:$src1, i32i8imm:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, (IntId2 VR128:$src1, (i32 imm:$src2)))]>; +} + +/// PDI_binop_rm - Simple SSE2 binary operator. +multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType OpVT, bit IsCommutable = 0, bit Is2Addr = 1> { + let isCommutable = IsCommutable in + def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, (OpVT (OpNode VR128:$src1, VR128:$src2)))]>; + def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, (OpVT (OpNode VR128:$src1, + (bitconvert (memopv2i64 addr:$src2)))))]>; +} + +/// PDI_binop_rm_v2i64 - Simple SSE2 binary operator whose type is v2i64. +/// +/// FIXME: we could eliminate this and use PDI_binop_rm instead if tblgen knew +/// to collapse (bitconvert VT to VT) into its operand. 
+/// +multiclass PDI_binop_rm_v2i64<bits<8> opc, string OpcodeStr, SDNode OpNode, + bit IsCommutable = 0, bit Is2Addr = 1> { + let isCommutable = IsCommutable in + def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, (v2i64 (OpNode VR128:$src1, VR128:$src2)))]>; + def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, (OpNode VR128:$src1, (memopv2i64 addr:$src2)))]>; +} + +} // ExeDomain = SSEPackedInt + +// 128-bit Integer Arithmetic + +let Predicates = [HasAVX] in { +defm VPADDB : PDI_binop_rm<0xFC, "vpaddb", add, v16i8, 1, 0 /*3addr*/>, VEX_4V; +defm VPADDW : PDI_binop_rm<0xFD, "vpaddw", add, v8i16, 1, 0>, VEX_4V; +defm VPADDD : PDI_binop_rm<0xFE, "vpaddd", add, v4i32, 1, 0>, VEX_4V; +defm VPADDQ : PDI_binop_rm_v2i64<0xD4, "vpaddq", add, 1, 0>, VEX_4V; +defm VPMULLW : PDI_binop_rm<0xD5, "vpmullw", mul, v8i16, 1, 0>, VEX_4V; +defm VPSUBB : PDI_binop_rm<0xF8, "vpsubb", sub, v16i8, 0, 0>, VEX_4V; +defm VPSUBW : PDI_binop_rm<0xF9, "vpsubw", sub, v8i16, 0, 0>, VEX_4V; +defm VPSUBD : PDI_binop_rm<0xFA, "vpsubd", sub, v4i32, 0, 0>, VEX_4V; +defm VPSUBQ : PDI_binop_rm_v2i64<0xFB, "vpsubq", sub, 0, 0>, VEX_4V; + +// Intrinsic forms +defm VPSUBSB : PDI_binop_rm_int<0xE8, "vpsubsb" , int_x86_sse2_psubs_b, 0, 0>, + VEX_4V; +defm VPSUBSW : PDI_binop_rm_int<0xE9, "vpsubsw" , int_x86_sse2_psubs_w, 0, 0>, + VEX_4V; +defm VPSUBUSB : PDI_binop_rm_int<0xD8, "vpsubusb", int_x86_sse2_psubus_b, 0, 0>, + VEX_4V; +defm VPSUBUSW : PDI_binop_rm_int<0xD9, "vpsubusw", int_x86_sse2_psubus_w, 0, 0>, + VEX_4V; +defm VPADDSB : PDI_binop_rm_int<0xEC, "vpaddsb" , int_x86_sse2_padds_b, 1, 0>, + VEX_4V; +defm VPADDSW : PDI_binop_rm_int<0xED, "vpaddsw" , int_x86_sse2_padds_w, 1, 0>, + VEX_4V; +defm VPADDUSB : PDI_binop_rm_int<0xDC, "vpaddusb", int_x86_sse2_paddus_b, 1, 0>, + VEX_4V; +defm VPADDUSW : PDI_binop_rm_int<0xDD, "vpaddusw", int_x86_sse2_paddus_w, 1, 0>, + VEX_4V; +defm VPMULHUW : PDI_binop_rm_int<0xE4, "vpmulhuw", int_x86_sse2_pmulhu_w, 1, 0>, + VEX_4V; +defm VPMULHW : PDI_binop_rm_int<0xE5, "vpmulhw" , int_x86_sse2_pmulh_w, 1, 0>, + VEX_4V; +defm VPMULUDQ : PDI_binop_rm_int<0xF4, "vpmuludq", int_x86_sse2_pmulu_dq, 1, 0>, + VEX_4V; +defm VPMADDWD : PDI_binop_rm_int<0xF5, "vpmaddwd", int_x86_sse2_pmadd_wd, 1, 0>, + VEX_4V; +defm VPAVGB : PDI_binop_rm_int<0xE0, "vpavgb", int_x86_sse2_pavg_b, 1, 0>, + VEX_4V; +defm VPAVGW : PDI_binop_rm_int<0xE3, "vpavgw", int_x86_sse2_pavg_w, 1, 0>, + VEX_4V; +defm VPMINUB : PDI_binop_rm_int<0xDA, "vpminub", int_x86_sse2_pminu_b, 1, 0>, + VEX_4V; +defm VPMINSW : PDI_binop_rm_int<0xEA, "vpminsw", int_x86_sse2_pmins_w, 1, 0>, + VEX_4V; +defm VPMAXUB : PDI_binop_rm_int<0xDE, "vpmaxub", int_x86_sse2_pmaxu_b, 1, 0>, + VEX_4V; +defm VPMAXSW : PDI_binop_rm_int<0xEE, "vpmaxsw", int_x86_sse2_pmaxs_w, 1, 0>, + VEX_4V; +defm VPSADBW : PDI_binop_rm_int<0xF6, "vpsadbw", int_x86_sse2_psad_bw, 1, 0>, + VEX_4V; +} + +let Constraints = "$src1 = $dst" in { +defm PADDB : PDI_binop_rm<0xFC, "paddb", add, v16i8, 1>; +defm PADDW : PDI_binop_rm<0xFD, "paddw", add, v8i16, 1>; +defm PADDD : PDI_binop_rm<0xFE, "paddd", add, v4i32, 1>; +defm PADDQ : PDI_binop_rm_v2i64<0xD4, "paddq", add, 1>; +defm PMULLW : PDI_binop_rm<0xD5, 
"pmullw", mul, v8i16, 1>; +defm PSUBB : PDI_binop_rm<0xF8, "psubb", sub, v16i8>; +defm PSUBW : PDI_binop_rm<0xF9, "psubw", sub, v8i16>; +defm PSUBD : PDI_binop_rm<0xFA, "psubd", sub, v4i32>; +defm PSUBQ : PDI_binop_rm_v2i64<0xFB, "psubq", sub>; + +// Intrinsic forms +defm PSUBSB : PDI_binop_rm_int<0xE8, "psubsb" , int_x86_sse2_psubs_b>; +defm PSUBSW : PDI_binop_rm_int<0xE9, "psubsw" , int_x86_sse2_psubs_w>; +defm PSUBUSB : PDI_binop_rm_int<0xD8, "psubusb", int_x86_sse2_psubus_b>; +defm PSUBUSW : PDI_binop_rm_int<0xD9, "psubusw", int_x86_sse2_psubus_w>; +defm PADDSB : PDI_binop_rm_int<0xEC, "paddsb" , int_x86_sse2_padds_b, 1>; +defm PADDSW : PDI_binop_rm_int<0xED, "paddsw" , int_x86_sse2_padds_w, 1>; +defm PADDUSB : PDI_binop_rm_int<0xDC, "paddusb", int_x86_sse2_paddus_b, 1>; +defm PADDUSW : PDI_binop_rm_int<0xDD, "paddusw", int_x86_sse2_paddus_w, 1>; +defm PMULHUW : PDI_binop_rm_int<0xE4, "pmulhuw", int_x86_sse2_pmulhu_w, 1>; +defm PMULHW : PDI_binop_rm_int<0xE5, "pmulhw" , int_x86_sse2_pmulh_w, 1>; +defm PMULUDQ : PDI_binop_rm_int<0xF4, "pmuludq", int_x86_sse2_pmulu_dq, 1>; +defm PMADDWD : PDI_binop_rm_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd, 1>; +defm PAVGB : PDI_binop_rm_int<0xE0, "pavgb", int_x86_sse2_pavg_b, 1>; +defm PAVGW : PDI_binop_rm_int<0xE3, "pavgw", int_x86_sse2_pavg_w, 1>; +defm PMINUB : PDI_binop_rm_int<0xDA, "pminub", int_x86_sse2_pminu_b, 1>; +defm PMINSW : PDI_binop_rm_int<0xEA, "pminsw", int_x86_sse2_pmins_w, 1>; +defm PMAXUB : PDI_binop_rm_int<0xDE, "pmaxub", int_x86_sse2_pmaxu_b, 1>; +defm PMAXSW : PDI_binop_rm_int<0xEE, "pmaxsw", int_x86_sse2_pmaxs_w, 1>; +defm PSADBW : PDI_binop_rm_int<0xF6, "psadbw", int_x86_sse2_psad_bw, 1>; + +} // Constraints = "$src1 = $dst" + +//===---------------------------------------------------------------------===// +// SSE2 - Packed Integer Logical Instructions +//===---------------------------------------------------------------------===// + +let Predicates = [HasAVX] in { +defm VPSLLW : PDI_binop_rmi_int<0xF1, 0x71, MRM6r, "vpsllw", + int_x86_sse2_psll_w, int_x86_sse2_pslli_w, 0>, + VEX_4V; +defm VPSLLD : PDI_binop_rmi_int<0xF2, 0x72, MRM6r, "vpslld", + int_x86_sse2_psll_d, int_x86_sse2_pslli_d, 0>, + VEX_4V; +defm VPSLLQ : PDI_binop_rmi_int<0xF3, 0x73, MRM6r, "vpsllq", + int_x86_sse2_psll_q, int_x86_sse2_pslli_q, 0>, + VEX_4V; + +defm VPSRLW : PDI_binop_rmi_int<0xD1, 0x71, MRM2r, "vpsrlw", + int_x86_sse2_psrl_w, int_x86_sse2_psrli_w, 0>, + VEX_4V; +defm VPSRLD : PDI_binop_rmi_int<0xD2, 0x72, MRM2r, "vpsrld", + int_x86_sse2_psrl_d, int_x86_sse2_psrli_d, 0>, + VEX_4V; +defm VPSRLQ : PDI_binop_rmi_int<0xD3, 0x73, MRM2r, "vpsrlq", + int_x86_sse2_psrl_q, int_x86_sse2_psrli_q, 0>, + VEX_4V; + +defm VPSRAW : PDI_binop_rmi_int<0xE1, 0x71, MRM4r, "vpsraw", + int_x86_sse2_psra_w, int_x86_sse2_psrai_w, 0>, + VEX_4V; +defm VPSRAD : PDI_binop_rmi_int<0xE2, 0x72, MRM4r, "vpsrad", + int_x86_sse2_psra_d, int_x86_sse2_psrai_d, 0>, + VEX_4V; + +defm VPAND : PDI_binop_rm_v2i64<0xDB, "vpand", and, 1, 0>, VEX_4V; +defm VPOR : PDI_binop_rm_v2i64<0xEB, "vpor" , or, 1, 0>, VEX_4V; +defm VPXOR : PDI_binop_rm_v2i64<0xEF, "vpxor", xor, 1, 0>, VEX_4V; + +let ExeDomain = SSEPackedInt in { + let neverHasSideEffects = 1 in { + // 128-bit logical shifts. 
+ def VPSLLDQri : PDIi8<0x73, MRM7r, + (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2), + "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + VEX_4V; + def VPSRLDQri : PDIi8<0x73, MRM3r, + (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2), + "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + VEX_4V; + // PSRADQri doesn't exist in SSE[1-3]. + } + def VPANDNrr : PDI<0xDF, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + "vpandn\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (v2i64 (X86andnp VR128:$src1, VR128:$src2)))]>,VEX_4V; + + def VPANDNrm : PDI<0xDF, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), + "vpandn\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, (X86andnp VR128:$src1, + (memopv2i64 addr:$src2)))]>, VEX_4V; +} +} + +let Constraints = "$src1 = $dst" in { +defm PSLLW : PDI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw", + int_x86_sse2_psll_w, int_x86_sse2_pslli_w>; +defm PSLLD : PDI_binop_rmi_int<0xF2, 0x72, MRM6r, "pslld", + int_x86_sse2_psll_d, int_x86_sse2_pslli_d>; +defm PSLLQ : PDI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq", + int_x86_sse2_psll_q, int_x86_sse2_pslli_q>; + +defm PSRLW : PDI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw", + int_x86_sse2_psrl_w, int_x86_sse2_psrli_w>; +defm PSRLD : PDI_binop_rmi_int<0xD2, 0x72, MRM2r, "psrld", + int_x86_sse2_psrl_d, int_x86_sse2_psrli_d>; +defm PSRLQ : PDI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq", + int_x86_sse2_psrl_q, int_x86_sse2_psrli_q>; + +defm PSRAW : PDI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw", + int_x86_sse2_psra_w, int_x86_sse2_psrai_w>; +defm PSRAD : PDI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad", + int_x86_sse2_psra_d, int_x86_sse2_psrai_d>; + +defm PAND : PDI_binop_rm_v2i64<0xDB, "pand", and, 1>; +defm POR : PDI_binop_rm_v2i64<0xEB, "por" , or, 1>; +defm PXOR : PDI_binop_rm_v2i64<0xEF, "pxor", xor, 1>; + +let ExeDomain = SSEPackedInt in { + let neverHasSideEffects = 1 in { + // 128-bit logical shifts. + def PSLLDQri : PDIi8<0x73, MRM7r, + (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2), + "pslldq\t{$src2, $dst|$dst, $src2}", []>; + def PSRLDQri : PDIi8<0x73, MRM3r, + (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2), + "psrldq\t{$src2, $dst|$dst, $src2}", []>; + // PSRADQri doesn't exist in SSE[1-3]. + } + def PANDNrr : PDI<0xDF, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + "pandn\t{$src2, $dst|$dst, $src2}", []>; + + def PANDNrm : PDI<0xDF, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), + "pandn\t{$src2, $dst|$dst, $src2}", []>; +} +} // Constraints = "$src1 = $dst" + +let Predicates = [HasAVX] in { + def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2), + (v2i64 (VPSLLDQri VR128:$src1, (BYTE_imm imm:$src2)))>; + def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2), + (v2i64 (VPSRLDQri VR128:$src1, (BYTE_imm imm:$src2)))>; + def : Pat<(int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2), + (v2i64 (VPSLLDQri VR128:$src1, imm:$src2))>; + def : Pat<(int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2), + (v2i64 (VPSRLDQri VR128:$src1, imm:$src2))>; + def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)), + (v2f64 (VPSRLDQri VR128:$src1, (BYTE_imm imm:$src2)))>; + + // Shift up / down and insert zero's. 
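+  // (X86vshl/X86vshr carry the amount in bits here; BYTE_imm converts it to
+  // the byte count that PSLLDQri/PSRLDQri expect.)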
+ def : Pat<(v2i64 (X86vshl VR128:$src, (i8 imm:$amt))), + (v2i64 (VPSLLDQri VR128:$src, (BYTE_imm imm:$amt)))>; + def : Pat<(v2i64 (X86vshr VR128:$src, (i8 imm:$amt))), + (v2i64 (VPSRLDQri VR128:$src, (BYTE_imm imm:$amt)))>; +} + +let Predicates = [HasSSE2] in { + def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2), + (v2i64 (PSLLDQri VR128:$src1, (BYTE_imm imm:$src2)))>; + def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2), + (v2i64 (PSRLDQri VR128:$src1, (BYTE_imm imm:$src2)))>; + def : Pat<(int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2), + (v2i64 (PSLLDQri VR128:$src1, imm:$src2))>; + def : Pat<(int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2), + (v2i64 (PSRLDQri VR128:$src1, imm:$src2))>; + def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)), + (v2f64 (PSRLDQri VR128:$src1, (BYTE_imm imm:$src2)))>; + + // Shift up / down and insert zero's. + def : Pat<(v2i64 (X86vshl VR128:$src, (i8 imm:$amt))), + (v2i64 (PSLLDQri VR128:$src, (BYTE_imm imm:$amt)))>; + def : Pat<(v2i64 (X86vshr VR128:$src, (i8 imm:$amt))), + (v2i64 (PSRLDQri VR128:$src, (BYTE_imm imm:$amt)))>; +} + +//===---------------------------------------------------------------------===// +// SSE2 - Packed Integer Comparison Instructions +//===---------------------------------------------------------------------===// + +let Predicates = [HasAVX] in { + defm VPCMPEQB : PDI_binop_rm_int<0x74, "vpcmpeqb", int_x86_sse2_pcmpeq_b, 1, + 0>, VEX_4V; + defm VPCMPEQW : PDI_binop_rm_int<0x75, "vpcmpeqw", int_x86_sse2_pcmpeq_w, 1, + 0>, VEX_4V; + defm VPCMPEQD : PDI_binop_rm_int<0x76, "vpcmpeqd", int_x86_sse2_pcmpeq_d, 1, + 0>, VEX_4V; + defm VPCMPGTB : PDI_binop_rm_int<0x64, "vpcmpgtb", int_x86_sse2_pcmpgt_b, 0, + 0>, VEX_4V; + defm VPCMPGTW : PDI_binop_rm_int<0x65, "vpcmpgtw", int_x86_sse2_pcmpgt_w, 0, + 0>, VEX_4V; + defm VPCMPGTD : PDI_binop_rm_int<0x66, "vpcmpgtd", int_x86_sse2_pcmpgt_d, 0, + 0>, VEX_4V; + + def : Pat<(v16i8 (X86pcmpeqb VR128:$src1, VR128:$src2)), + (VPCMPEQBrr VR128:$src1, VR128:$src2)>; + def : Pat<(v16i8 (X86pcmpeqb VR128:$src1, (memop addr:$src2))), + (VPCMPEQBrm VR128:$src1, addr:$src2)>; + def : Pat<(v8i16 (X86pcmpeqw VR128:$src1, VR128:$src2)), + (VPCMPEQWrr VR128:$src1, VR128:$src2)>; + def : Pat<(v8i16 (X86pcmpeqw VR128:$src1, (memop addr:$src2))), + (VPCMPEQWrm VR128:$src1, addr:$src2)>; + def : Pat<(v4i32 (X86pcmpeqd VR128:$src1, VR128:$src2)), + (VPCMPEQDrr VR128:$src1, VR128:$src2)>; + def : Pat<(v4i32 (X86pcmpeqd VR128:$src1, (memop addr:$src2))), + (VPCMPEQDrm VR128:$src1, addr:$src2)>; + + def : Pat<(v16i8 (X86pcmpgtb VR128:$src1, VR128:$src2)), + (VPCMPGTBrr VR128:$src1, VR128:$src2)>; + def : Pat<(v16i8 (X86pcmpgtb VR128:$src1, (memop addr:$src2))), + (VPCMPGTBrm VR128:$src1, addr:$src2)>; + def : Pat<(v8i16 (X86pcmpgtw VR128:$src1, VR128:$src2)), + (VPCMPGTWrr VR128:$src1, VR128:$src2)>; + def : Pat<(v8i16 (X86pcmpgtw VR128:$src1, (memop addr:$src2))), + (VPCMPGTWrm VR128:$src1, addr:$src2)>; + def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, VR128:$src2)), + (VPCMPGTDrr VR128:$src1, VR128:$src2)>; + def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, (memop addr:$src2))), + (VPCMPGTDrm VR128:$src1, addr:$src2)>; +} + +let Constraints = "$src1 = $dst" in { + defm PCMPEQB : PDI_binop_rm_int<0x74, "pcmpeqb", int_x86_sse2_pcmpeq_b, 1>; + defm PCMPEQW : PDI_binop_rm_int<0x75, "pcmpeqw", int_x86_sse2_pcmpeq_w, 1>; + defm PCMPEQD : PDI_binop_rm_int<0x76, "pcmpeqd", int_x86_sse2_pcmpeq_d, 1>; + defm PCMPGTB : PDI_binop_rm_int<0x64, "pcmpgtb", int_x86_sse2_pcmpgt_b>; + defm PCMPGTW : PDI_binop_rm_int<0x65, 
"pcmpgtw", int_x86_sse2_pcmpgt_w>; + defm PCMPGTD : PDI_binop_rm_int<0x66, "pcmpgtd", int_x86_sse2_pcmpgt_d>; +} // Constraints = "$src1 = $dst" + +let Predicates = [HasSSE2] in { + def : Pat<(v16i8 (X86pcmpeqb VR128:$src1, VR128:$src2)), + (PCMPEQBrr VR128:$src1, VR128:$src2)>; + def : Pat<(v16i8 (X86pcmpeqb VR128:$src1, (memop addr:$src2))), + (PCMPEQBrm VR128:$src1, addr:$src2)>; + def : Pat<(v8i16 (X86pcmpeqw VR128:$src1, VR128:$src2)), + (PCMPEQWrr VR128:$src1, VR128:$src2)>; + def : Pat<(v8i16 (X86pcmpeqw VR128:$src1, (memop addr:$src2))), + (PCMPEQWrm VR128:$src1, addr:$src2)>; + def : Pat<(v4i32 (X86pcmpeqd VR128:$src1, VR128:$src2)), + (PCMPEQDrr VR128:$src1, VR128:$src2)>; + def : Pat<(v4i32 (X86pcmpeqd VR128:$src1, (memop addr:$src2))), + (PCMPEQDrm VR128:$src1, addr:$src2)>; + + def : Pat<(v16i8 (X86pcmpgtb VR128:$src1, VR128:$src2)), + (PCMPGTBrr VR128:$src1, VR128:$src2)>; + def : Pat<(v16i8 (X86pcmpgtb VR128:$src1, (memop addr:$src2))), + (PCMPGTBrm VR128:$src1, addr:$src2)>; + def : Pat<(v8i16 (X86pcmpgtw VR128:$src1, VR128:$src2)), + (PCMPGTWrr VR128:$src1, VR128:$src2)>; + def : Pat<(v8i16 (X86pcmpgtw VR128:$src1, (memop addr:$src2))), + (PCMPGTWrm VR128:$src1, addr:$src2)>; + def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, VR128:$src2)), + (PCMPGTDrr VR128:$src1, VR128:$src2)>; + def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, (memop addr:$src2))), + (PCMPGTDrm VR128:$src1, addr:$src2)>; +} + +//===---------------------------------------------------------------------===// +// SSE2 - Packed Integer Pack Instructions +//===---------------------------------------------------------------------===// + +let Predicates = [HasAVX] in { +defm VPACKSSWB : PDI_binop_rm_int<0x63, "vpacksswb", int_x86_sse2_packsswb_128, + 0, 0>, VEX_4V; +defm VPACKSSDW : PDI_binop_rm_int<0x6B, "vpackssdw", int_x86_sse2_packssdw_128, + 0, 0>, VEX_4V; +defm VPACKUSWB : PDI_binop_rm_int<0x67, "vpackuswb", int_x86_sse2_packuswb_128, + 0, 0>, VEX_4V; +} + +let Constraints = "$src1 = $dst" in { +defm PACKSSWB : PDI_binop_rm_int<0x63, "packsswb", int_x86_sse2_packsswb_128>; +defm PACKSSDW : PDI_binop_rm_int<0x6B, "packssdw", int_x86_sse2_packssdw_128>; +defm PACKUSWB : PDI_binop_rm_int<0x67, "packuswb", int_x86_sse2_packuswb_128>; +} // Constraints = "$src1 = $dst" + +//===---------------------------------------------------------------------===// +// SSE2 - Packed Integer Shuffle Instructions +//===---------------------------------------------------------------------===// + +let ExeDomain = SSEPackedInt in { +multiclass sse2_pshuffle<string OpcodeStr, ValueType vt, PatFrag pshuf_frag, + PatFrag bc_frag> { +def ri : Ii8<0x70, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, (vt (pshuf_frag:$src2 VR128:$src1, + (undef))))]>; +def mi : Ii8<0x70, MRMSrcMem, + (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, (vt (pshuf_frag:$src2 + (bc_frag (memopv2i64 addr:$src1)), + (undef))))]>; +} +} // ExeDomain = SSEPackedInt + +let Predicates = [HasAVX] in { + let AddedComplexity = 5 in + defm VPSHUFD : sse2_pshuffle<"vpshufd", v4i32, pshufd, bc_v4i32>, TB, OpSize, + VEX; + + // SSE2 with ImmT == Imm8 and XS prefix. + defm VPSHUFHW : sse2_pshuffle<"vpshufhw", v8i16, pshufhw, bc_v8i16>, XS, + VEX; + + // SSE2 with ImmT == Imm8 and XD prefix. 
+ defm VPSHUFLW : sse2_pshuffle<"vpshuflw", v8i16, pshuflw, bc_v8i16>, XD, + VEX; + + let AddedComplexity = 5 in + def : Pat<(v4f32 (pshufd:$src2 VR128:$src1, (undef))), + (VPSHUFDri VR128:$src1, (SHUFFLE_get_shuf_imm VR128:$src2))>; + // Unary v4f32 shuffle with VPSHUF* in order to fold a load. + def : Pat<(pshufd:$src2 (bc_v4i32 (memopv4f32 addr:$src1)), (undef)), + (VPSHUFDmi addr:$src1, (SHUFFLE_get_shuf_imm VR128:$src2))>; + + def : Pat<(v4i32 (X86PShufd (bc_v4i32 (memopv2i64 addr:$src1)), + (i8 imm:$imm))), + (VPSHUFDmi addr:$src1, imm:$imm)>; + def : Pat<(v4i32 (X86PShufd (bc_v4i32 (memopv4f32 addr:$src1)), + (i8 imm:$imm))), + (VPSHUFDmi addr:$src1, imm:$imm)>; + def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))), + (VPSHUFDri VR128:$src1, imm:$imm)>; + def : Pat<(v4i32 (X86PShufd VR128:$src1, (i8 imm:$imm))), + (VPSHUFDri VR128:$src1, imm:$imm)>; + def : Pat<(v8i16 (X86PShufhw VR128:$src, (i8 imm:$imm))), + (VPSHUFHWri VR128:$src, imm:$imm)>; + def : Pat<(v8i16 (X86PShufhw (bc_v8i16 (memopv2i64 addr:$src)), + (i8 imm:$imm))), + (VPSHUFHWmi addr:$src, imm:$imm)>; + def : Pat<(v8i16 (X86PShuflw VR128:$src, (i8 imm:$imm))), + (VPSHUFLWri VR128:$src, imm:$imm)>; + def : Pat<(v8i16 (X86PShuflw (bc_v8i16 (memopv2i64 addr:$src)), + (i8 imm:$imm))), + (VPSHUFLWmi addr:$src, imm:$imm)>; +} + +let Predicates = [HasSSE2] in { + let AddedComplexity = 5 in + defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, pshufd, bc_v4i32>, TB, OpSize; + + // SSE2 with ImmT == Imm8 and XS prefix. + defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, pshufhw, bc_v8i16>, XS; + + // SSE2 with ImmT == Imm8 and XD prefix. + defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, pshuflw, bc_v8i16>, XD; + + let AddedComplexity = 5 in + def : Pat<(v4f32 (pshufd:$src2 VR128:$src1, (undef))), + (PSHUFDri VR128:$src1, (SHUFFLE_get_shuf_imm VR128:$src2))>; + // Unary v4f32 shuffle with PSHUF* in order to fold a load. 
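+  // (This moves the shuffle into the integer domain, which can cost a bypass
+  // delay on some microarchitectures, but it lets the load fold away.)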
+ def : Pat<(pshufd:$src2 (bc_v4i32 (memopv4f32 addr:$src1)), (undef)), + (PSHUFDmi addr:$src1, (SHUFFLE_get_shuf_imm VR128:$src2))>; + + def : Pat<(v4i32 (X86PShufd (bc_v4i32 (memopv2i64 addr:$src1)), + (i8 imm:$imm))), + (PSHUFDmi addr:$src1, imm:$imm)>; + def : Pat<(v4i32 (X86PShufd (bc_v4i32 (memopv4f32 addr:$src1)), + (i8 imm:$imm))), + (PSHUFDmi addr:$src1, imm:$imm)>; + def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))), + (PSHUFDri VR128:$src1, imm:$imm)>; + def : Pat<(v4i32 (X86PShufd VR128:$src1, (i8 imm:$imm))), + (PSHUFDri VR128:$src1, imm:$imm)>; + def : Pat<(v8i16 (X86PShufhw VR128:$src, (i8 imm:$imm))), + (PSHUFHWri VR128:$src, imm:$imm)>; + def : Pat<(v8i16 (X86PShufhw (bc_v8i16 (memopv2i64 addr:$src)), + (i8 imm:$imm))), + (PSHUFHWmi addr:$src, imm:$imm)>; + def : Pat<(v8i16 (X86PShuflw VR128:$src, (i8 imm:$imm))), + (PSHUFLWri VR128:$src, imm:$imm)>; + def : Pat<(v8i16 (X86PShuflw (bc_v8i16 (memopv2i64 addr:$src)), + (i8 imm:$imm))), + (PSHUFLWmi addr:$src, imm:$imm)>; +} + +//===---------------------------------------------------------------------===// +// SSE2 - Packed Integer Unpack Instructions +//===---------------------------------------------------------------------===// + +let ExeDomain = SSEPackedInt in { +multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt, + SDNode OpNode, PatFrag bc_frag, bit Is2Addr = 1> { + def rr : PDI<opc, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))]>; + def rm : PDI<opc, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, (OpNode VR128:$src1, + (bc_frag (memopv2i64 + addr:$src2))))]>; +} + +let Predicates = [HasAVX] in { + defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Punpcklbw, + bc_v16i8, 0>, VEX_4V; + defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Punpcklwd, + bc_v8i16, 0>, VEX_4V; + defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Punpckldq, + bc_v4i32, 0>, VEX_4V; + + /// FIXME: we could eliminate this and use sse2_unpack instead if tblgen + /// knew to collapse (bitconvert VT to VT) into its operand. + def VPUNPCKLQDQrr : PDI<0x6C, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + "vpunpcklqdq\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, (v2i64 (X86Punpcklqdq VR128:$src1, + VR128:$src2)))]>, VEX_4V; + def VPUNPCKLQDQrm : PDI<0x6C, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), + "vpunpcklqdq\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, (v2i64 (X86Punpcklqdq VR128:$src1, + (memopv2i64 addr:$src2))))]>, VEX_4V; + + defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Punpckhbw, + bc_v16i8, 0>, VEX_4V; + defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Punpckhwd, + bc_v8i16, 0>, VEX_4V; + defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Punpckhdq, + bc_v4i32, 0>, VEX_4V; + + /// FIXME: we could eliminate this and use sse2_unpack instead if tblgen + /// knew to collapse (bitconvert VT to VT) into its operand. 
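+  /// (The hand-written defs below use v2i64 operands directly, so no
+  /// bitconvert appears in their patterns.)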
+ def VPUNPCKHQDQrr : PDI<0x6D, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + "vpunpckhqdq\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, (v2i64 (X86Punpckhqdq VR128:$src1, + VR128:$src2)))]>, VEX_4V; + def VPUNPCKHQDQrm : PDI<0x6D, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), + "vpunpckhqdq\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, (v2i64 (X86Punpckhqdq VR128:$src1, + (memopv2i64 addr:$src2))))]>, VEX_4V; +} + +let Constraints = "$src1 = $dst" in { + defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, X86Punpcklbw, bc_v16i8>; + defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, X86Punpcklwd, bc_v8i16>; + defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, X86Punpckldq, bc_v4i32>; + + /// FIXME: we could eliminate this and use sse2_unpack instead if tblgen + /// knew to collapse (bitconvert VT to VT) into its operand. + def PUNPCKLQDQrr : PDI<0x6C, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + "punpcklqdq\t{$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v2i64 (X86Punpcklqdq VR128:$src1, VR128:$src2)))]>; + def PUNPCKLQDQrm : PDI<0x6C, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), + "punpcklqdq\t{$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v2i64 (X86Punpcklqdq VR128:$src1, + (memopv2i64 addr:$src2))))]>; + + defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, X86Punpckhbw, bc_v16i8>; + defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, X86Punpckhwd, bc_v8i16>; + defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Punpckhdq, bc_v4i32>; + + /// FIXME: we could eliminate this and use sse2_unpack instead if tblgen + /// knew to collapse (bitconvert VT to VT) into its operand. + def PUNPCKHQDQrr : PDI<0x6D, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + "punpckhqdq\t{$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v2i64 (X86Punpckhqdq VR128:$src1, VR128:$src2)))]>; + def PUNPCKHQDQrm : PDI<0x6D, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), + "punpckhqdq\t{$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v2i64 (X86Punpckhqdq VR128:$src1, + (memopv2i64 addr:$src2))))]>; +} +} // ExeDomain = SSEPackedInt + +// Splat v2f64 / v2i64 +let AddedComplexity = 10 in { + def : Pat<(splat_lo (v2i64 VR128:$src), (undef)), + (PUNPCKLQDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>; + def : Pat<(splat_lo (v2i64 VR128:$src), (undef)), + (VPUNPCKLQDQrr VR128:$src, VR128:$src)>, Requires<[HasAVX]>; +} + +//===---------------------------------------------------------------------===// +// SSE2 - Packed Integer Extract and Insert +//===---------------------------------------------------------------------===// + +let ExeDomain = SSEPackedInt in { +multiclass sse2_pinsrw<bit Is2Addr = 1> { + def rri : Ii8<0xC4, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, + GR32:$src2, i32i8imm:$src3), + !if(Is2Addr, + "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", + "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (X86pinsrw VR128:$src1, GR32:$src2, imm:$src3))]>; + def rmi : Ii8<0xC4, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, + i16mem:$src2, i32i8imm:$src3), + !if(Is2Addr, + "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", + "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (X86pinsrw VR128:$src1, (extloadi16 addr:$src2), + imm:$src3))]>; +} + +// Extract +let Predicates = [HasAVX] in +def VPEXTRWri : Ii8<0xC5, MRMSrcReg, + (outs GR32:$dst), 
(ins VR128:$src1, i32i8imm:$src2), + "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32:$dst, (X86pextrw (v8i16 VR128:$src1), + imm:$src2))]>, TB, OpSize, VEX; +def PEXTRWri : PDIi8<0xC5, MRMSrcReg, + (outs GR32:$dst), (ins VR128:$src1, i32i8imm:$src2), + "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32:$dst, (X86pextrw (v8i16 VR128:$src1), + imm:$src2))]>; + +// Insert +let Predicates = [HasAVX] in { + defm VPINSRW : sse2_pinsrw<0>, TB, OpSize, VEX_4V; + def VPINSRWrr64i : Ii8<0xC4, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, GR64:$src2, i32i8imm:$src3), + "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, TB, OpSize, VEX_4V; +} + +let Constraints = "$src1 = $dst" in + defm PINSRW : sse2_pinsrw, TB, OpSize, Requires<[HasSSE2]>; + +} // ExeDomain = SSEPackedInt + +//===---------------------------------------------------------------------===// +// SSE2 - Packed Mask Creation +//===---------------------------------------------------------------------===// + +let ExeDomain = SSEPackedInt in { + +def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src), + "pmovmskb\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))]>, VEX; +def VPMOVMSKBr64r : VPDI<0xD7, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src), + "pmovmskb\t{$src, $dst|$dst, $src}", []>, VEX; +def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src), + "pmovmskb\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))]>; + +} // ExeDomain = SSEPackedInt + +//===---------------------------------------------------------------------===// +// SSE2 - Conditional Store +//===---------------------------------------------------------------------===// + +let ExeDomain = SSEPackedInt in { + +let Uses = [EDI] in +def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs), + (ins VR128:$src, VR128:$mask), + "maskmovdqu\t{$mask, $src|$src, $mask}", + [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>, VEX; +let Uses = [RDI] in +def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs), + (ins VR128:$src, VR128:$mask), + "maskmovdqu\t{$mask, $src|$src, $mask}", + [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>, VEX; + +let Uses = [EDI] in +def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), + "maskmovdqu\t{$mask, $src|$src, $mask}", + [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>; +let Uses = [RDI] in +def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), + "maskmovdqu\t{$mask, $src|$src, $mask}", + [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>; + +} // ExeDomain = SSEPackedInt + +//===---------------------------------------------------------------------===// +// SSE2 - Move Doubleword +//===---------------------------------------------------------------------===// + +//===---------------------------------------------------------------------===// +// Move Int Doubleword to Packed Double Int +// +def VMOVDI2PDIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (scalar_to_vector GR32:$src)))]>, VEX; +def VMOVDI2PDIrm : VPDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>, + VEX; +def VMOV64toPQIrr : VRPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), + "mov{d|q}\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v2i64 (scalar_to_vector 
GR64:$src)))]>, VEX; +def VMOV64toSDrr : VRPDI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), + "mov{d|q}\t{$src, $dst|$dst, $src}", + [(set FR64:$dst, (bitconvert GR64:$src))]>, VEX; + +def MOVDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (scalar_to_vector GR32:$src)))]>; +def MOVDI2PDIrm : PDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>; +def MOV64toPQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), + "mov{d|q}\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v2i64 (scalar_to_vector GR64:$src)))]>; +def MOV64toSDrr : RPDI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), + "mov{d|q}\t{$src, $dst|$dst, $src}", + [(set FR64:$dst, (bitconvert GR64:$src))]>; + +//===---------------------------------------------------------------------===// +// Move Int Doubleword to Single Scalar +// +def VMOVDI2SSrr : VPDI<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set FR32:$dst, (bitconvert GR32:$src))]>, VEX; + +def VMOVDI2SSrm : VPDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>, + VEX; +def MOVDI2SSrr : PDI<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set FR32:$dst, (bitconvert GR32:$src))]>; + +def MOVDI2SSrm : PDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>; + +//===---------------------------------------------------------------------===// +// Move Packed Doubleword Int to Packed Double Int +// +def VMOVPDI2DIrr : VPDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (vector_extract (v4i32 VR128:$src), + (iPTR 0)))]>, VEX; +def VMOVPDI2DImr : VPDI<0x7E, MRMDestMem, (outs), + (ins i32mem:$dst, VR128:$src), + "movd\t{$src, $dst|$dst, $src}", + [(store (i32 (vector_extract (v4i32 VR128:$src), + (iPTR 0))), addr:$dst)]>, VEX; +def MOVPDI2DIrr : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (vector_extract (v4i32 VR128:$src), + (iPTR 0)))]>; +def MOVPDI2DImr : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src), + "movd\t{$src, $dst|$dst, $src}", + [(store (i32 (vector_extract (v4i32 VR128:$src), + (iPTR 0))), addr:$dst)]>; + +//===---------------------------------------------------------------------===// +// Move Packed Doubleword Int first element to Doubleword Int +// +def VMOVPQIto64rr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), + "mov{d|q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (vector_extract (v2i64 VR128:$src), + (iPTR 0)))]>, + TB, OpSize, VEX, VEX_W, Requires<[HasAVX, In64BitMode]>; + +def MOVPQIto64rr : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), + "mov{d|q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (vector_extract (v2i64 VR128:$src), + (iPTR 0)))]>; + +//===---------------------------------------------------------------------===// +// Bitcast FR64 <-> GR64 +// +let Predicates = [HasAVX] in +def VMOV64toSDrm : S3SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>, + VEX; +def VMOVSDto64rr : VRPDI<0x7E, MRMDestReg, (outs 
GR64:$dst), (ins FR64:$src), + "mov{d|q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (bitconvert FR64:$src))]>; +def VMOVSDto64mr : VRPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src), + "movq\t{$src, $dst|$dst, $src}", + [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>; + +def MOV64toSDrm : S3SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>; +def MOVSDto64rr : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src), + "mov{d|q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (bitconvert FR64:$src))]>; +def MOVSDto64mr : RPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src), + "movq\t{$src, $dst|$dst, $src}", + [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>; + +//===---------------------------------------------------------------------===// +// Move Scalar Single to Double Int +// +def VMOVSS2DIrr : VPDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (bitconvert FR32:$src))]>, VEX; +def VMOVSS2DImr : VPDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>, VEX; +def MOVSS2DIrr : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (bitconvert FR32:$src))]>; +def MOVSS2DImr : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>; + +//===---------------------------------------------------------------------===// +// Patterns and instructions to describe movd/movq to XMM register zero-extends +// +let AddedComplexity = 15 in { +def VMOVZDI2PDIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (v4i32 (X86vzmovl + (v4i32 (scalar_to_vector GR32:$src)))))]>, + VEX; +def VMOVZQI2PQIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), + "mov{d|q}\t{$src, $dst|$dst, $src}", // X86-64 only + [(set VR128:$dst, (v2i64 (X86vzmovl + (v2i64 (scalar_to_vector GR64:$src)))))]>, + VEX, VEX_W; +} +let AddedComplexity = 15 in { +def MOVZDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (v4i32 (X86vzmovl + (v4i32 (scalar_to_vector GR32:$src)))))]>; +def MOVZQI2PQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), + "mov{d|q}\t{$src, $dst|$dst, $src}", // X86-64 only + [(set VR128:$dst, (v2i64 (X86vzmovl + (v2i64 (scalar_to_vector GR64:$src)))))]>; +} + +let AddedComplexity = 20 in { +def VMOVZDI2PDIrm : VPDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (X86vzmovl (v4i32 (scalar_to_vector + (loadi32 addr:$src))))))]>, + VEX; +def MOVZDI2PDIrm : PDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (X86vzmovl (v4i32 (scalar_to_vector + (loadi32 addr:$src))))))]>; +} + +let Predicates = [HasSSE2], AddedComplexity = 20 in { + def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))), + (MOVZDI2PDIrm addr:$src)>; + def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))), + (MOVZDI2PDIrm addr:$src)>; + def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))), + (MOVZDI2PDIrm addr:$src)>; +} + +let Predicates = [HasAVX] in { + // AVX 128-bit movd/movq instruction write zeros in the high 128-bit 
part. + let AddedComplexity = 20 in { + def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))), + (VMOVZDI2PDIrm addr:$src)>; + def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))), + (VMOVZDI2PDIrm addr:$src)>; + def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))), + (VMOVZDI2PDIrm addr:$src)>; + } + // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext. + def : Pat<(v8i32 (X86vzmovl (insert_subvector undef, + (v4i32 (scalar_to_vector GR32:$src)),(i32 0)))), + (SUBREG_TO_REG (i32 0), (VMOVZDI2PDIrr GR32:$src), sub_xmm)>; + def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, + (v2i64 (scalar_to_vector GR64:$src)),(i32 0)))), + (SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrr GR64:$src), sub_xmm)>; +} + +// These are the correct encodings of the instructions so that we know how to +// read correct assembly, even though we continue to emit the wrong ones for +// compatibility with Darwin's buggy assembler. +def : InstAlias<"movq\t{$src, $dst|$dst, $src}", + (MOV64toPQIrr VR128:$dst, GR64:$src), 0>; +def : InstAlias<"movq\t{$src, $dst|$dst, $src}", + (MOV64toSDrr FR64:$dst, GR64:$src), 0>; +def : InstAlias<"movq\t{$src, $dst|$dst, $src}", + (MOVPQIto64rr GR64:$dst, VR128:$src), 0>; +def : InstAlias<"movq\t{$src, $dst|$dst, $src}", + (MOVSDto64rr GR64:$dst, FR64:$src), 0>; +def : InstAlias<"movq\t{$src, $dst|$dst, $src}", + (VMOVZQI2PQIrr VR128:$dst, GR64:$src), 0>; +def : InstAlias<"movq\t{$src, $dst|$dst, $src}", + (MOVZQI2PQIrr VR128:$dst, GR64:$src), 0>; + +//===---------------------------------------------------------------------===// +// SSE2 - Move Quadword +//===---------------------------------------------------------------------===// + +//===---------------------------------------------------------------------===// +// Move Quadword Int to Packed Quadword Int +// +def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS, + VEX, Requires<[HasAVX]>; +def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS, + Requires<[HasSSE2]>; // SSE2 instruction with XS Prefix + +//===---------------------------------------------------------------------===// +// Move Packed Quadword Int to Quadword Int +// +def VMOVPQI2QImr : VPDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), + "movq\t{$src, $dst|$dst, $src}", + [(store (i64 (vector_extract (v2i64 VR128:$src), + (iPTR 0))), addr:$dst)]>, VEX; +def MOVPQI2QImr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), + "movq\t{$src, $dst|$dst, $src}", + [(store (i64 (vector_extract (v2i64 VR128:$src), + (iPTR 0))), addr:$dst)]>; + +//===---------------------------------------------------------------------===// +// Store / copy lower 64-bits of a XMM register. 
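+// (This is the MOVQ store form, 66 0F D6; only the low quadword is written.)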
+// +def VMOVLQ128mr : VPDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), + "movq\t{$src, $dst|$dst, $src}", + [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>, VEX; +def MOVLQ128mr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), + "movq\t{$src, $dst|$dst, $src}", + [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>; + +let AddedComplexity = 20 in +def VMOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v2i64 (X86vzmovl (v2i64 (scalar_to_vector + (loadi64 addr:$src))))))]>, + XS, VEX, Requires<[HasAVX]>; + +let AddedComplexity = 20 in +def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v2i64 (X86vzmovl (v2i64 (scalar_to_vector + (loadi64 addr:$src))))))]>, + XS, Requires<[HasSSE2]>; + +let Predicates = [HasSSE2], AddedComplexity = 20 in { + def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))), + (MOVZQI2PQIrm addr:$src)>; + def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))), + (MOVZQI2PQIrm addr:$src)>; + def : Pat<(v2i64 (X86vzload addr:$src)), (MOVZQI2PQIrm addr:$src)>; +} + +let Predicates = [HasAVX], AddedComplexity = 20 in { + def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))), + (VMOVZQI2PQIrm addr:$src)>; + def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))), + (VMOVZQI2PQIrm addr:$src)>; + def : Pat<(v2i64 (X86vzload addr:$src)), + (VMOVZQI2PQIrm addr:$src)>; +} + +//===---------------------------------------------------------------------===// +// Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in +// IA32 document. movq xmm1, xmm2 does clear the high bits. +// +let AddedComplexity = 15 in +def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>, + XS, VEX, Requires<[HasAVX]>; +let AddedComplexity = 15 in +def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>, + XS, Requires<[HasSSE2]>; + +let AddedComplexity = 20 in +def VMOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (v2i64 (X86vzmovl + (loadv2i64 addr:$src))))]>, + XS, VEX, Requires<[HasAVX]>; +let AddedComplexity = 20 in { +def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (v2i64 (X86vzmovl + (loadv2i64 addr:$src))))]>, + XS, Requires<[HasSSE2]>; +} + +let AddedComplexity = 20 in { + let Predicates = [HasSSE2] in { + def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4i32 addr:$src)))), + (MOVZPQILo2PQIrm addr:$src)>; + def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))), + (MOVZPQILo2PQIrr VR128:$src)>; + } + let Predicates = [HasAVX] in { + def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4i32 addr:$src)))), + (VMOVZPQILo2PQIrm addr:$src)>; + def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))), + (VMOVZPQILo2PQIrr VR128:$src)>; + } +} + +// Instructions to match in the assembler +def VMOVQs64rr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), + "movq\t{$src, $dst|$dst, $src}", []>, VEX, VEX_W; +def VMOVQd64rr : VPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), + "movq\t{$src, $dst|$dst, $src}", []>, VEX, VEX_W; +// Recognize "movd" with GR64 destination, but encode as a "movq" +def 
VMOVQd64rr_alt : VPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), + "movd\t{$src, $dst|$dst, $src}", []>, VEX, VEX_W; + +// Instructions for the disassembler +// xr = XMM register +// xm = mem64 + +let Predicates = [HasAVX] in +def VMOVQxrxr: I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "vmovq\t{$src, $dst|$dst, $src}", []>, VEX, XS; +def MOVQxrxr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "movq\t{$src, $dst|$dst, $src}", []>, XS; + +//===---------------------------------------------------------------------===// +// SSE3 - Conversion Instructions +//===---------------------------------------------------------------------===// + +// Convert Packed Double FP to Packed DW Integers +let Predicates = [HasAVX] in { +// The assembler can recognize rr 256-bit instructions by seeing a ymm +// register, but the same isn't true when using memory operands instead. +// Provide other assembly rr and rm forms to address this explicitly. +def VCVTPD2DQrr : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "vcvtpd2dq\t{$src, $dst|$dst, $src}", []>, VEX; +def VCVTPD2DQXrYr : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), + "vcvtpd2dq\t{$src, $dst|$dst, $src}", []>, VEX; + +// XMM only +def VCVTPD2DQXrr : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "vcvtpd2dqx\t{$src, $dst|$dst, $src}", []>, VEX; +def VCVTPD2DQXrm : S3DI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "vcvtpd2dqx\t{$src, $dst|$dst, $src}", []>, VEX; + +// YMM only +def VCVTPD2DQYrr : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), + "vcvtpd2dqy\t{$src, $dst|$dst, $src}", []>, VEX; +def VCVTPD2DQYrm : S3DI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), + "vcvtpd2dqy\t{$src, $dst|$dst, $src}", []>, VEX, VEX_L; +} + +def CVTPD2DQrm : S3DI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvtpd2dq\t{$src, $dst|$dst, $src}", []>; +def CVTPD2DQrr : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvtpd2dq\t{$src, $dst|$dst, $src}", []>; + +def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))), + (VCVTPD2DQYrr VR256:$src)>; +def : Pat<(v4i32 (fp_to_sint (memopv4f64 addr:$src))), + (VCVTPD2DQYrm addr:$src)>; + +// Convert Packed DW Integers to Packed Double FP +let Predicates = [HasAVX] in { +def VCVTDQ2PDrm : S3SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX; +def VCVTDQ2PDrr : S3SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX; +def VCVTDQ2PDYrm : S3SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src), + "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX; +def VCVTDQ2PDYrr : S3SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), + "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX; +} + +def CVTDQ2PDrm : S3SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvtdq2pd\t{$src, $dst|$dst, $src}", []>; +def CVTDQ2PDrr : S3SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvtdq2pd\t{$src, $dst|$dst, $src}", []>; + +// AVX 256-bit register conversion intrinsics +def : Pat<(int_x86_avx_cvtdq2_pd_256 VR128:$src), + (VCVTDQ2PDYrr VR128:$src)>; +def : Pat<(int_x86_avx_cvtdq2_pd_256 (memopv4i32 addr:$src)), + (VCVTDQ2PDYrm addr:$src)>; + +def : Pat<(int_x86_avx_cvt_pd2dq_256 VR256:$src), + (VCVTPD2DQYrr VR256:$src)>; +def : Pat<(int_x86_avx_cvt_pd2dq_256 (memopv4f64 addr:$src)), + (VCVTPD2DQYrm addr:$src)>; + +def : Pat<(v4f64 (sint_to_fp (v4i32 VR128:$src))), + (VCVTDQ2PDYrr VR128:$src)>; +def : 
Pat<(v4f64 (sint_to_fp (memopv4i32 addr:$src))), + (VCVTDQ2PDYrm addr:$src)>; + +//===---------------------------------------------------------------------===// +// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP +//===---------------------------------------------------------------------===// +multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr, + ValueType vt, RegisterClass RC, PatFrag mem_frag, + X86MemOperand x86memop> { +def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (vt (OpNode RC:$src)))]>; +def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>; +} + +let Predicates = [HasAVX] in { + defm VMOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup", + v4f32, VR128, memopv4f32, f128mem>, VEX; + defm VMOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup", + v4f32, VR128, memopv4f32, f128mem>, VEX; + defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup", + v8f32, VR256, memopv8f32, f256mem>, VEX; + defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup", + v8f32, VR256, memopv8f32, f256mem>, VEX; +} +defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128, + memopv4f32, f128mem>; +defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128, + memopv4f32, f128mem>; + +let Predicates = [HasSSE3] in { + def : Pat<(v4i32 (X86Movshdup VR128:$src)), + (MOVSHDUPrr VR128:$src)>; + def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))), + (MOVSHDUPrm addr:$src)>; + def : Pat<(v4i32 (X86Movsldup VR128:$src)), + (MOVSLDUPrr VR128:$src)>; + def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))), + (MOVSLDUPrm addr:$src)>; +} + +let Predicates = [HasAVX] in { + def : Pat<(v4i32 (X86Movshdup VR128:$src)), + (VMOVSHDUPrr VR128:$src)>; + def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))), + (VMOVSHDUPrm addr:$src)>; + def : Pat<(v4i32 (X86Movsldup VR128:$src)), + (VMOVSLDUPrr VR128:$src)>; + def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))), + (VMOVSLDUPrm addr:$src)>; + def : Pat<(v8i32 (X86Movshdup VR256:$src)), + (VMOVSHDUPYrr VR256:$src)>; + def : Pat<(v8i32 (X86Movshdup (bc_v8i32 (memopv4i64 addr:$src)))), + (VMOVSHDUPYrm addr:$src)>; + def : Pat<(v8i32 (X86Movsldup VR256:$src)), + (VMOVSLDUPYrr VR256:$src)>; + def : Pat<(v8i32 (X86Movsldup (bc_v8i32 (memopv4i64 addr:$src)))), + (VMOVSLDUPYrm addr:$src)>; +} + +//===---------------------------------------------------------------------===// +// SSE3 - Replicate Double FP - MOVDDUP +//===---------------------------------------------------------------------===// + +multiclass sse3_replicate_dfp<string OpcodeStr> { +def rr : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst,(v2f64 (movddup VR128:$src, (undef))))]>; +def rm : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, + (v2f64 (movddup (scalar_to_vector (loadf64 addr:$src)), + (undef))))]>; +} + +// FIXME: Merge with above classe when there're patterns for the ymm version +multiclass sse3_replicate_dfp_y<string OpcodeStr> { +let Predicates = [HasAVX] in { + def rr : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + []>; + 
def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + []>; + } +} + +defm MOVDDUP : sse3_replicate_dfp<"movddup">; +defm VMOVDDUP : sse3_replicate_dfp<"vmovddup">, VEX; +defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX; + +let Predicates = [HasSSE3] in { + def : Pat<(movddup (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src)))), + (undef)), + (MOVDDUPrm addr:$src)>; + let AddedComplexity = 5 in { + def : Pat<(movddup (memopv2f64 addr:$src), (undef)), (MOVDDUPrm addr:$src)>; + def : Pat<(movddup (bc_v4f32 (memopv2f64 addr:$src)), (undef)), + (MOVDDUPrm addr:$src)>; + def : Pat<(movddup (memopv2i64 addr:$src), (undef)), (MOVDDUPrm addr:$src)>; + def : Pat<(movddup (bc_v4i32 (memopv2i64 addr:$src)), (undef)), + (MOVDDUPrm addr:$src)>; + } + def : Pat<(X86Movddup (memopv2f64 addr:$src)), + (MOVDDUPrm addr:$src)>; + def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))), + (MOVDDUPrm addr:$src)>; + def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))), + (MOVDDUPrm addr:$src)>; + def : Pat<(X86Movddup (v2f64 (scalar_to_vector (loadf64 addr:$src)))), + (MOVDDUPrm addr:$src)>; + def : Pat<(X86Movddup (bc_v2f64 + (v2i64 (scalar_to_vector (loadi64 addr:$src))))), + (MOVDDUPrm addr:$src)>; +} + +let Predicates = [HasAVX] in { + def : Pat<(movddup (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src)))), + (undef)), + (VMOVDDUPrm addr:$src)>; + let AddedComplexity = 5 in { + def : Pat<(movddup (memopv2f64 addr:$src), (undef)), (VMOVDDUPrm addr:$src)>; + def : Pat<(movddup (bc_v4f32 (memopv2f64 addr:$src)), (undef)), + (VMOVDDUPrm addr:$src)>; + def : Pat<(movddup (memopv2i64 addr:$src), (undef)), (VMOVDDUPrm addr:$src)>; + def : Pat<(movddup (bc_v4i32 (memopv2i64 addr:$src)), (undef)), + (VMOVDDUPrm addr:$src)>; + } + def : Pat<(X86Movddup (memopv2f64 addr:$src)), + (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; + def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))), + (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; + def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))), + (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; + def : Pat<(X86Movddup (v2f64 (scalar_to_vector (loadf64 addr:$src)))), + (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; + def : Pat<(X86Movddup (bc_v2f64 + (v2i64 (scalar_to_vector (loadi64 addr:$src))))), + (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; + + // 256-bit version + def : Pat<(X86Movddup (memopv4f64 addr:$src)), + (VMOVDDUPYrm addr:$src)>; + def : Pat<(X86Movddup (memopv4i64 addr:$src)), + (VMOVDDUPYrm addr:$src)>; + def : Pat<(X86Movddup (v4f64 (scalar_to_vector (loadf64 addr:$src)))), + (VMOVDDUPYrm addr:$src)>; + def : Pat<(X86Movddup (v4i64 (scalar_to_vector (loadi64 addr:$src)))), + (VMOVDDUPYrm addr:$src)>; + def : Pat<(X86Movddup (v4f64 VR256:$src)), + (VMOVDDUPYrr VR256:$src)>; + def : Pat<(X86Movddup (v4i64 VR256:$src)), + (VMOVDDUPYrr VR256:$src)>; +} + +//===---------------------------------------------------------------------===// +// SSE3 - Move Unaligned Integer +//===---------------------------------------------------------------------===// + +let Predicates = [HasAVX] in { + def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "vlddqu\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, VEX; + def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), + "vlddqu\t{$src, $dst|$dst, $src}", + [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>, VEX; +} +def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs 
VR128:$dst), (ins i128mem:$src), + "lddqu\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>; + +//===---------------------------------------------------------------------===// +// SSE3 - Arithmetic +//===---------------------------------------------------------------------===// + +multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, RegisterClass RC, + X86MemOperand x86memop, bit Is2Addr = 1> { + def rr : I<0xD0, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (Int RC:$src1, RC:$src2))]>; + def rm : I<0xD0, MRMSrcMem, + (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (Int RC:$src1, (memop addr:$src2)))]>; +} + +let Predicates = [HasAVX], + ExeDomain = SSEPackedDouble in { + defm VADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "vaddsubps", VR128, + f128mem, 0>, TB, XD, VEX_4V; + defm VADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "vaddsubpd", VR128, + f128mem, 0>, TB, OpSize, VEX_4V; + defm VADDSUBPSY : sse3_addsub<int_x86_avx_addsub_ps_256, "vaddsubps", VR256, + f256mem, 0>, TB, XD, VEX_4V; + defm VADDSUBPDY : sse3_addsub<int_x86_avx_addsub_pd_256, "vaddsubpd", VR256, + f256mem, 0>, TB, OpSize, VEX_4V; +} +let Constraints = "$src1 = $dst", Predicates = [HasSSE3], + ExeDomain = SSEPackedDouble in { + defm ADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "addsubps", VR128, + f128mem>, TB, XD; + defm ADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "addsubpd", VR128, + f128mem>, TB, OpSize; +} + +//===---------------------------------------------------------------------===// +// SSE3 Instructions +//===---------------------------------------------------------------------===// + +// Horizontal ops +multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, + X86MemOperand x86memop, SDNode OpNode, bit Is2Addr = 1> { + def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>; + + def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (vt (OpNode RC:$src1, (memop addr:$src2))))]>; +} +multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, + X86MemOperand x86memop, SDNode OpNode, bit Is2Addr = 1> { + def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>; + + def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (vt (OpNode RC:$src1, (memop addr:$src2))))]>; +} + +let Predicates = [HasAVX] in { + defm VHADDPS : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem, + X86fhadd, 0>, VEX_4V; + defm VHADDPD : S3_Int <0x7C, "vhaddpd", v2f64, VR128, 
f128mem, + X86fhadd, 0>, VEX_4V; + defm VHSUBPS : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem, + X86fhsub, 0>, VEX_4V; + defm VHSUBPD : S3_Int <0x7D, "vhsubpd", v2f64, VR128, f128mem, + X86fhsub, 0>, VEX_4V; + defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem, + X86fhadd, 0>, VEX_4V; + defm VHADDPDY : S3_Int <0x7C, "vhaddpd", v4f64, VR256, f256mem, + X86fhadd, 0>, VEX_4V; + defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem, + X86fhsub, 0>, VEX_4V; + defm VHSUBPDY : S3_Int <0x7D, "vhsubpd", v4f64, VR256, f256mem, + X86fhsub, 0>, VEX_4V; +} + +let Constraints = "$src1 = $dst" in { + defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd>; + defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd>; + defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub>; + defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub>; +} + +//===---------------------------------------------------------------------===// +// SSSE3 - Packed Absolute Instructions +//===---------------------------------------------------------------------===// + + +/// SS3I_unop_rm_int - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}. +multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr, + PatFrag mem_frag128, Intrinsic IntId128> { + def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (IntId128 VR128:$src))]>, + OpSize; + + def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, + (IntId128 + (bitconvert (mem_frag128 addr:$src))))]>, OpSize; +} + +let Predicates = [HasAVX] in { + defm VPABSB : SS3I_unop_rm_int<0x1C, "vpabsb", memopv16i8, + int_x86_ssse3_pabs_b_128>, VEX; + defm VPABSW : SS3I_unop_rm_int<0x1D, "vpabsw", memopv8i16, + int_x86_ssse3_pabs_w_128>, VEX; + defm VPABSD : SS3I_unop_rm_int<0x1E, "vpabsd", memopv4i32, + int_x86_ssse3_pabs_d_128>, VEX; +} + +defm PABSB : SS3I_unop_rm_int<0x1C, "pabsb", memopv16i8, + int_x86_ssse3_pabs_b_128>; +defm PABSW : SS3I_unop_rm_int<0x1D, "pabsw", memopv8i16, + int_x86_ssse3_pabs_w_128>; +defm PABSD : SS3I_unop_rm_int<0x1E, "pabsd", memopv4i32, + int_x86_ssse3_pabs_d_128>; + +//===---------------------------------------------------------------------===// +// SSSE3 - Packed Binary Operator Instructions +//===---------------------------------------------------------------------===// + +/// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}. 
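+// A minimal sketch of how the Is2Addr flag in the multiclass below plays out,
+// taking the PHADDW and VPHADDW instantiations further down as the inputs:
+// the SSE defms sit under Constraints = "$src1 = $dst", so !if(Is2Addr, ...)
+// keeps the two-operand string
+//   "phaddw\t{$src2, $dst|$dst, $src2}"
+// while the AVX defms pass Is2Addr = 0 (with VEX_4V) and get the
+// three-operand string
+//   "vphaddw\t{$src2, $src1, $dst|$dst, $src1, $src2}"
+// Each instantiation expands to an rr128 register form and an rm128 form that
+// folds a bitconverted memopv16i8 load.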
+multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr, + PatFrag mem_frag128, Intrinsic IntId128, + bit Is2Addr = 1> { + let isCommutable = 1 in + def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>, + OpSize; + def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, + (IntId128 VR128:$src1, + (bitconvert (memopv16i8 addr:$src2))))]>, OpSize; +} + +let ImmT = NoImm, Predicates = [HasAVX] in { +let isCommutable = 0 in { + defm VPHADDW : SS3I_binop_rm_int<0x01, "vphaddw", memopv8i16, + int_x86_ssse3_phadd_w_128, 0>, VEX_4V; + defm VPHADDD : SS3I_binop_rm_int<0x02, "vphaddd", memopv4i32, + int_x86_ssse3_phadd_d_128, 0>, VEX_4V; + defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw", memopv8i16, + int_x86_ssse3_phadd_sw_128, 0>, VEX_4V; + defm VPHSUBW : SS3I_binop_rm_int<0x05, "vphsubw", memopv8i16, + int_x86_ssse3_phsub_w_128, 0>, VEX_4V; + defm VPHSUBD : SS3I_binop_rm_int<0x06, "vphsubd", memopv4i32, + int_x86_ssse3_phsub_d_128, 0>, VEX_4V; + defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw", memopv8i16, + int_x86_ssse3_phsub_sw_128, 0>, VEX_4V; + defm VPMADDUBSW : SS3I_binop_rm_int<0x04, "vpmaddubsw", memopv16i8, + int_x86_ssse3_pmadd_ub_sw_128, 0>, VEX_4V; + defm VPSHUFB : SS3I_binop_rm_int<0x00, "vpshufb", memopv16i8, + int_x86_ssse3_pshuf_b_128, 0>, VEX_4V; + defm VPSIGNB : SS3I_binop_rm_int<0x08, "vpsignb", memopv16i8, + int_x86_ssse3_psign_b_128, 0>, VEX_4V; + defm VPSIGNW : SS3I_binop_rm_int<0x09, "vpsignw", memopv8i16, + int_x86_ssse3_psign_w_128, 0>, VEX_4V; + defm VPSIGND : SS3I_binop_rm_int<0x0A, "vpsignd", memopv4i32, + int_x86_ssse3_psign_d_128, 0>, VEX_4V; +} +defm VPMULHRSW : SS3I_binop_rm_int<0x0B, "vpmulhrsw", memopv8i16, + int_x86_ssse3_pmul_hr_sw_128, 0>, VEX_4V; +} + +// None of these have i8 immediate fields. 
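+// As a rough guide to the names generated here: every defm yields an rr128
+// and an rm128 def (e.g. PSHUFBrr128 / PSHUFBrm128), and the standalone
+// selection patterns later in this section refer to those names directly,
+// for example:
+//   def : Pat<(X86pshufb VR128:$src, VR128:$mask),
+//             (PSHUFBrr128 VR128:$src, VR128:$mask)>;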
+let ImmT = NoImm, Constraints = "$src1 = $dst" in { +let isCommutable = 0 in { + defm PHADDW : SS3I_binop_rm_int<0x01, "phaddw", memopv8i16, + int_x86_ssse3_phadd_w_128>; + defm PHADDD : SS3I_binop_rm_int<0x02, "phaddd", memopv4i32, + int_x86_ssse3_phadd_d_128>; + defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw", memopv8i16, + int_x86_ssse3_phadd_sw_128>; + defm PHSUBW : SS3I_binop_rm_int<0x05, "phsubw", memopv8i16, + int_x86_ssse3_phsub_w_128>; + defm PHSUBD : SS3I_binop_rm_int<0x06, "phsubd", memopv4i32, + int_x86_ssse3_phsub_d_128>; + defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw", memopv8i16, + int_x86_ssse3_phsub_sw_128>; + defm PMADDUBSW : SS3I_binop_rm_int<0x04, "pmaddubsw", memopv16i8, + int_x86_ssse3_pmadd_ub_sw_128>; + defm PSHUFB : SS3I_binop_rm_int<0x00, "pshufb", memopv16i8, + int_x86_ssse3_pshuf_b_128>; + defm PSIGNB : SS3I_binop_rm_int<0x08, "psignb", memopv16i8, + int_x86_ssse3_psign_b_128>; + defm PSIGNW : SS3I_binop_rm_int<0x09, "psignw", memopv8i16, + int_x86_ssse3_psign_w_128>; + defm PSIGND : SS3I_binop_rm_int<0x0A, "psignd", memopv4i32, + int_x86_ssse3_psign_d_128>; +} +defm PMULHRSW : SS3I_binop_rm_int<0x0B, "pmulhrsw", memopv8i16, + int_x86_ssse3_pmul_hr_sw_128>; +} + +let Predicates = [HasSSSE3] in { + def : Pat<(X86pshufb VR128:$src, VR128:$mask), + (PSHUFBrr128 VR128:$src, VR128:$mask)>; + def : Pat<(X86pshufb VR128:$src, (bc_v16i8 (memopv2i64 addr:$mask))), + (PSHUFBrm128 VR128:$src, addr:$mask)>; + + def : Pat<(X86psignb VR128:$src1, VR128:$src2), + (PSIGNBrr128 VR128:$src1, VR128:$src2)>; + def : Pat<(X86psignw VR128:$src1, VR128:$src2), + (PSIGNWrr128 VR128:$src1, VR128:$src2)>; + def : Pat<(X86psignd VR128:$src1, VR128:$src2), + (PSIGNDrr128 VR128:$src1, VR128:$src2)>; +} + +let Predicates = [HasAVX] in { + def : Pat<(X86pshufb VR128:$src, VR128:$mask), + (VPSHUFBrr128 VR128:$src, VR128:$mask)>; + def : Pat<(X86pshufb VR128:$src, (bc_v16i8 (memopv2i64 addr:$mask))), + (VPSHUFBrm128 VR128:$src, addr:$mask)>; + + def : Pat<(X86psignb VR128:$src1, VR128:$src2), + (VPSIGNBrr128 VR128:$src1, VR128:$src2)>; + def : Pat<(X86psignw VR128:$src1, VR128:$src2), + (VPSIGNWrr128 VR128:$src1, VR128:$src2)>; + def : Pat<(X86psignd VR128:$src1, VR128:$src2), + (VPSIGNDrr128 VR128:$src1, VR128:$src2)>; +} + +//===---------------------------------------------------------------------===// +// SSSE3 - Packed Align Instruction Patterns +//===---------------------------------------------------------------------===// + +multiclass ssse3_palign<string asm, bit Is2Addr = 1> { + def R128rr : SS3AI<0x0F, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, i8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + []>, OpSize; + def R128rm : SS3AI<0x0F, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2, i8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + []>, OpSize; +} + +let Predicates = [HasAVX] in + defm VPALIGN : ssse3_palign<"vpalignr", 0>, VEX_4V; +let Constraints = "$src1 = $dst", Predicates = [HasSSSE3] in + defm PALIGN : ssse3_palign<"palignr">; + +let Predicates = [HasSSSE3] in { +def : Pat<(v4i32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; +def : Pat<(v4f32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (PALIGNR128rr VR128:$src2, 
VR128:$src1, imm:$imm)>; +def : Pat<(v8i16 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; +def : Pat<(v16i8 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; +} + +let Predicates = [HasAVX] in { +def : Pat<(v4i32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; +def : Pat<(v4f32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; +def : Pat<(v8i16 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; +def : Pat<(v16i8 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))), + (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; +} + +//===---------------------------------------------------------------------===// +// SSSE3 - Thread synchronization +//===---------------------------------------------------------------------===// + +let usesCustomInserter = 1 in { +def MONITOR : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3), + [(int_x86_sse3_monitor addr:$src1, GR32:$src2, GR32:$src3)]>; +def MWAIT : PseudoI<(outs), (ins GR32:$src1, GR32:$src2), + [(int_x86_sse3_mwait GR32:$src1, GR32:$src2)]>; +} + +let Uses = [EAX, ECX, EDX] in +def MONITORrrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>, TB, + Requires<[HasSSE3]>; +let Uses = [ECX, EAX] in +def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait", []>, TB, + Requires<[HasSSE3]>; + +def : InstAlias<"mwait %eax, %ecx", (MWAITrr)>, Requires<[In32BitMode]>; +def : InstAlias<"mwait %rax, %rcx", (MWAITrr)>, Requires<[In64BitMode]>; + +def : InstAlias<"monitor %eax, %ecx, %edx", (MONITORrrr)>, + Requires<[In32BitMode]>; +def : InstAlias<"monitor %rax, %rcx, %rdx", (MONITORrrr)>, + Requires<[In64BitMode]>; + +//===----------------------------------------------------------------------===// +// SSE4.1 - Packed Move with Sign/Zero Extend +//===----------------------------------------------------------------------===// + +multiclass SS41I_binop_rm_int8<bits<8> opc, string OpcodeStr, Intrinsic IntId> { + def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (IntId VR128:$src))]>, OpSize; + + def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, + (IntId (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))]>, + OpSize; +} + +let Predicates = [HasAVX] in { +defm VPMOVSXBW : SS41I_binop_rm_int8<0x20, "vpmovsxbw", int_x86_sse41_pmovsxbw>, + VEX; +defm VPMOVSXWD : SS41I_binop_rm_int8<0x23, "vpmovsxwd", int_x86_sse41_pmovsxwd>, + VEX; +defm VPMOVSXDQ : SS41I_binop_rm_int8<0x25, "vpmovsxdq", int_x86_sse41_pmovsxdq>, + VEX; +defm VPMOVZXBW : SS41I_binop_rm_int8<0x30, "vpmovzxbw", int_x86_sse41_pmovzxbw>, + VEX; +defm VPMOVZXWD : SS41I_binop_rm_int8<0x33, "vpmovzxwd", int_x86_sse41_pmovzxwd>, + VEX; +defm VPMOVZXDQ : SS41I_binop_rm_int8<0x35, "vpmovzxdq", int_x86_sse41_pmovzxdq>, + VEX; +} + +defm PMOVSXBW : SS41I_binop_rm_int8<0x20, "pmovsxbw", int_x86_sse41_pmovsxbw>; +defm PMOVSXWD : SS41I_binop_rm_int8<0x23, "pmovsxwd", int_x86_sse41_pmovsxwd>; +defm PMOVSXDQ : SS41I_binop_rm_int8<0x25, "pmovsxdq", int_x86_sse41_pmovsxdq>; +defm PMOVZXBW : SS41I_binop_rm_int8<0x30, "pmovzxbw", int_x86_sse41_pmovzxbw>; +defm PMOVZXWD : SS41I_binop_rm_int8<0x33, "pmovzxwd", int_x86_sse41_pmovzxwd>; 
+defm PMOVZXDQ : SS41I_binop_rm_int8<0x35, "pmovzxdq", int_x86_sse41_pmovzxdq>; + +let Predicates = [HasSSE41] in { + // Common patterns involving scalar load. + def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)), + (PMOVSXBWrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)), + (PMOVSXBWrm addr:$src)>; + + def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)), + (PMOVSXWDrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)), + (PMOVSXWDrm addr:$src)>; + + def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)), + (PMOVSXDQrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)), + (PMOVSXDQrm addr:$src)>; + + def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)), + (PMOVZXBWrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)), + (PMOVZXBWrm addr:$src)>; + + def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)), + (PMOVZXWDrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)), + (PMOVZXWDrm addr:$src)>; + + def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)), + (PMOVZXDQrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)), + (PMOVZXDQrm addr:$src)>; +} + +let Predicates = [HasAVX] in { + // Common patterns involving scalar load. + def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)), + (VPMOVSXBWrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)), + (VPMOVSXBWrm addr:$src)>; + + def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)), + (VPMOVSXWDrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)), + (VPMOVSXWDrm addr:$src)>; + + def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)), + (VPMOVSXDQrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)), + (VPMOVSXDQrm addr:$src)>; + + def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)), + (VPMOVZXBWrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)), + (VPMOVZXBWrm addr:$src)>; + + def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)), + (VPMOVZXWDrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)), + (VPMOVZXWDrm addr:$src)>; + + def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)), + (VPMOVZXDQrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)), + (VPMOVZXDQrm addr:$src)>; +} + + +multiclass SS41I_binop_rm_int4<bits<8> opc, string OpcodeStr, Intrinsic IntId> { + def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (IntId VR128:$src))]>, OpSize; + + def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, + (IntId (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))]>, + OpSize; +} + +let Predicates = [HasAVX] in { +defm VPMOVSXBD : SS41I_binop_rm_int4<0x21, "vpmovsxbd", int_x86_sse41_pmovsxbd>, + VEX; +defm VPMOVSXWQ : SS41I_binop_rm_int4<0x24, "vpmovsxwq", int_x86_sse41_pmovsxwq>, + VEX; +defm VPMOVZXBD : SS41I_binop_rm_int4<0x31, "vpmovzxbd", int_x86_sse41_pmovzxbd>, + VEX; +defm VPMOVZXWQ : SS41I_binop_rm_int4<0x34, "vpmovzxwq", int_x86_sse41_pmovzxwq>, + VEX; +} + +defm PMOVSXBD : SS41I_binop_rm_int4<0x21, "pmovsxbd", int_x86_sse41_pmovsxbd>; +defm PMOVSXWQ : SS41I_binop_rm_int4<0x24, "pmovsxwq", int_x86_sse41_pmovsxwq>; +defm PMOVZXBD : SS41I_binop_rm_int4<0x31, "pmovzxbd", 
int_x86_sse41_pmovzxbd>; +defm PMOVZXWQ : SS41I_binop_rm_int4<0x34, "pmovzxwq", int_x86_sse41_pmovzxwq>; + +let Predicates = [HasSSE41] in { + // Common patterns involving scalar load + def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)), + (PMOVSXBDrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovsxwq (vzmovl_v4i32 addr:$src)), + (PMOVSXWQrm addr:$src)>; + + def : Pat<(int_x86_sse41_pmovzxbd (vzmovl_v4i32 addr:$src)), + (PMOVZXBDrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovzxwq (vzmovl_v4i32 addr:$src)), + (PMOVZXWQrm addr:$src)>; +} + +let Predicates = [HasAVX] in { + // Common patterns involving scalar load + def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)), + (VPMOVSXBDrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovsxwq (vzmovl_v4i32 addr:$src)), + (VPMOVSXWQrm addr:$src)>; + + def : Pat<(int_x86_sse41_pmovzxbd (vzmovl_v4i32 addr:$src)), + (VPMOVZXBDrm addr:$src)>; + def : Pat<(int_x86_sse41_pmovzxwq (vzmovl_v4i32 addr:$src)), + (VPMOVZXWQrm addr:$src)>; +} + +multiclass SS41I_binop_rm_int2<bits<8> opc, string OpcodeStr, Intrinsic IntId> { + def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (IntId VR128:$src))]>, OpSize; + + // Expecting a i16 load any extended to i32 value. + def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i16mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (IntId (bitconvert + (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))]>, + OpSize; +} + +let Predicates = [HasAVX] in { +defm VPMOVSXBQ : SS41I_binop_rm_int2<0x22, "vpmovsxbq", int_x86_sse41_pmovsxbq>, + VEX; +defm VPMOVZXBQ : SS41I_binop_rm_int2<0x32, "vpmovzxbq", int_x86_sse41_pmovzxbq>, + VEX; +} +defm PMOVSXBQ : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq>; +defm PMOVZXBQ : SS41I_binop_rm_int2<0x32, "pmovzxbq", int_x86_sse41_pmovzxbq>; + +let Predicates = [HasSSE41] in { + // Common patterns involving scalar load + def : Pat<(int_x86_sse41_pmovsxbq + (bitconvert (v4i32 (X86vzmovl + (v4i32 (scalar_to_vector (loadi32 addr:$src))))))), + (PMOVSXBQrm addr:$src)>; + + def : Pat<(int_x86_sse41_pmovzxbq + (bitconvert (v4i32 (X86vzmovl + (v4i32 (scalar_to_vector (loadi32 addr:$src))))))), + (PMOVZXBQrm addr:$src)>; +} + +let Predicates = [HasAVX] in { + // Common patterns involving scalar load + def : Pat<(int_x86_sse41_pmovsxbq + (bitconvert (v4i32 (X86vzmovl + (v4i32 (scalar_to_vector (loadi32 addr:$src))))))), + (VPMOVSXBQrm addr:$src)>; + + def : Pat<(int_x86_sse41_pmovzxbq + (bitconvert (v4i32 (X86vzmovl + (v4i32 (scalar_to_vector (loadi32 addr:$src))))))), + (VPMOVZXBQrm addr:$src)>; +} + +//===----------------------------------------------------------------------===// +// SSE4.1 - Extract Instructions +//===----------------------------------------------------------------------===// + +/// SS41I_binop_ext8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem +multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> { + def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst), + (ins VR128:$src1, i32i8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set GR32:$dst, (X86pextrb (v16i8 VR128:$src1), imm:$src2))]>, + OpSize; + def mr : SS4AIi8<opc, MRMDestMem, (outs), + (ins i8mem:$dst, VR128:$src1, i32i8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, OpSize; +// FIXME: +// There's an AssertZext in the way of writing the store pattern +// (store (i8 (trunc (X86pextrb 
(v16i8 VR128:$src1), imm:$src2))), addr:$dst) +} + +let Predicates = [HasAVX] in { + defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX; + def VPEXTRBrr64 : SS4AIi8<0x14, MRMDestReg, (outs GR64:$dst), + (ins VR128:$src1, i32i8imm:$src2), + "vpextrb\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, OpSize, VEX; +} + +defm PEXTRB : SS41I_extract8<0x14, "pextrb">; + + +/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination +multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> { + def mr : SS4AIi8<opc, MRMDestMem, (outs), + (ins i16mem:$dst, VR128:$src1, i32i8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, OpSize; +// FIXME: +// There's an AssertZext in the way of writing the store pattern +// (store (i16 (trunc (X86pextrw (v16i8 VR128:$src1), imm:$src2))), addr:$dst) +} + +let Predicates = [HasAVX] in + defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX; + +defm PEXTRW : SS41I_extract16<0x15, "pextrw">; + + +/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination +multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> { + def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst), + (ins VR128:$src1, i32i8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set GR32:$dst, + (extractelt (v4i32 VR128:$src1), imm:$src2))]>, OpSize; + def mr : SS4AIi8<opc, MRMDestMem, (outs), + (ins i32mem:$dst, VR128:$src1, i32i8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(store (extractelt (v4i32 VR128:$src1), imm:$src2), + addr:$dst)]>, OpSize; +} + +let Predicates = [HasAVX] in + defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX; + +defm PEXTRD : SS41I_extract32<0x16, "pextrd">; + +/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination +multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> { + def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst), + (ins VR128:$src1, i32i8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set GR64:$dst, + (extractelt (v2i64 VR128:$src1), imm:$src2))]>, OpSize, REX_W; + def mr : SS4AIi8<opc, MRMDestMem, (outs), + (ins i64mem:$dst, VR128:$src1, i32i8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(store (extractelt (v2i64 VR128:$src1), imm:$src2), + addr:$dst)]>, OpSize, REX_W; +} + +let Predicates = [HasAVX] in + defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W; + +defm PEXTRQ : SS41I_extract64<0x16, "pextrq">; + +/// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory +/// destination +multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> { + def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst), + (ins VR128:$src1, i32i8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set GR32:$dst, + (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>, + OpSize; + def mr : SS4AIi8<opc, MRMDestMem, (outs), + (ins f32mem:$dst, VR128:$src1, i32i8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2), + addr:$dst)]>, OpSize; +} + +let Predicates = [HasAVX] in { + defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX; + def VEXTRACTPSrr64 : SS4AIi8<0x17, MRMDestReg, (outs GR64:$dst), + (ins VR128:$src1, i32i8imm:$src2), + "vextractps \t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, OpSize, VEX; +} +defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">; + +// 
Also match an EXTRACTPS store when the store is done as f32 instead of i32. +def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)), + imm:$src2))), + addr:$dst), + (EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>, + Requires<[HasSSE41]>; +def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)), + imm:$src2))), + addr:$dst), + (VEXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>, + Requires<[HasAVX]>; + +//===----------------------------------------------------------------------===// +// SSE4.1 - Insert Instructions +//===----------------------------------------------------------------------===// + +multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> { + def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, GR32:$src2, i32i8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (X86pinsrb VR128:$src1, GR32:$src2, imm:$src3))]>, OpSize; + def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i8mem:$src2, i32i8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (X86pinsrb VR128:$src1, (extloadi8 addr:$src2), + imm:$src3))]>, OpSize; +} + +let Predicates = [HasAVX] in + defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V; +let Constraints = "$src1 = $dst" in + defm PINSRB : SS41I_insert8<0x20, "pinsrb">; + +multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> { + def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, GR32:$src2, i32i8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>, + OpSize; + def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i32mem:$src2, i32i8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2), + imm:$src3)))]>, OpSize; +} + +let Predicates = [HasAVX] in + defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V; +let Constraints = "$src1 = $dst" in + defm PINSRD : SS41I_insert32<0x22, "pinsrd">; + +multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> { + def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, GR64:$src2, i32i8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>, + OpSize; + def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i64mem:$src2, i32i8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2), + imm:$src3)))]>, OpSize; +} + +let Predicates = [HasAVX] in + defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W; +let Constraints = "$src1 = $dst" in + defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, 
REX_W; + +// insertps has a few different modes, there's the first two here below which +// are optimized inserts that won't zero arbitrary elements in the destination +// vector. The next one matches the intrinsic and could zero arbitrary elements +// in the target vector. +multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> { + def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, u32u8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (X86insrtps VR128:$src1, VR128:$src2, imm:$src3))]>, + OpSize; + def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, f32mem:$src2, u32u8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (X86insrtps VR128:$src1, + (v4f32 (scalar_to_vector (loadf32 addr:$src2))), + imm:$src3))]>, OpSize; +} + +let Constraints = "$src1 = $dst" in + defm INSERTPS : SS41I_insertf32<0x21, "insertps">; +let Predicates = [HasAVX] in + defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V; + +def : Pat<(int_x86_sse41_insertps VR128:$src1, VR128:$src2, imm:$src3), + (VINSERTPSrr VR128:$src1, VR128:$src2, imm:$src3)>, + Requires<[HasAVX]>; +def : Pat<(int_x86_sse41_insertps VR128:$src1, VR128:$src2, imm:$src3), + (INSERTPSrr VR128:$src1, VR128:$src2, imm:$src3)>, + Requires<[HasSSE41]>; + +//===----------------------------------------------------------------------===// +// SSE4.1 - Round Instructions +//===----------------------------------------------------------------------===// + +multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd, string OpcodeStr, + X86MemOperand x86memop, RegisterClass RC, + PatFrag mem_frag32, PatFrag mem_frag64, + Intrinsic V4F32Int, Intrinsic V2F64Int> { + // Intrinsic operation, reg. + // Vector intrinsic operation, reg + def PSr : SS4AIi8<opcps, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2), + !strconcat(OpcodeStr, + "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (V4F32Int RC:$src1, imm:$src2))]>, + OpSize; + + // Vector intrinsic operation, mem + def PSm : Ii8<opcps, MRMSrcMem, + (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2), + !strconcat(OpcodeStr, + "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, + (V4F32Int (mem_frag32 addr:$src1),imm:$src2))]>, + TA, OpSize, + Requires<[HasSSE41]>; + + // Vector intrinsic operation, reg + def PDr : SS4AIi8<opcpd, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2), + !strconcat(OpcodeStr, + "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (V2F64Int RC:$src1, imm:$src2))]>, + OpSize; + + // Vector intrinsic operation, mem + def PDm : SS4AIi8<opcpd, MRMSrcMem, + (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2), + !strconcat(OpcodeStr, + "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, + (V2F64Int (mem_frag64 addr:$src1),imm:$src2))]>, + OpSize; +} + +multiclass sse41_fp_unop_rm_avx_p<bits<8> opcps, bits<8> opcpd, + RegisterClass RC, X86MemOperand x86memop, string OpcodeStr> { + // Intrinsic operation, reg. 
+ // Vector intrinsic operation, reg + def PSr_AVX : SS4AIi8<opcps, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2), + !strconcat(OpcodeStr, + "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, OpSize; + + // Vector intrinsic operation, mem + def PSm_AVX : Ii8<opcps, MRMSrcMem, + (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2), + !strconcat(OpcodeStr, + "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, TA, OpSize, Requires<[HasSSE41]>; + + // Vector intrinsic operation, reg + def PDr_AVX : SS4AIi8<opcpd, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2), + !strconcat(OpcodeStr, + "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, OpSize; + + // Vector intrinsic operation, mem + def PDm_AVX : SS4AIi8<opcpd, MRMSrcMem, + (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2), + !strconcat(OpcodeStr, + "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, OpSize; +} + +multiclass sse41_fp_binop_rm<bits<8> opcss, bits<8> opcsd, + string OpcodeStr, + Intrinsic F32Int, + Intrinsic F64Int, bit Is2Addr = 1> { + // Intrinsic operation, reg. + def SSr : SS4AIi8<opcss, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2, imm:$src3))]>, + OpSize; + + // Intrinsic operation, mem. + def SSm : SS4AIi8<opcss, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32i8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (F32Int VR128:$src1, sse_load_f32:$src2, imm:$src3))]>, + OpSize; + + // Intrinsic operation, reg. + def SDr : SS4AIi8<opcsd, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2, imm:$src3))]>, + OpSize; + + // Intrinsic operation, mem. + def SDm : SS4AIi8<opcsd, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32i8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (F64Int VR128:$src1, sse_load_f64:$src2, imm:$src3))]>, + OpSize; +} + +multiclass sse41_fp_binop_rm_avx_s<bits<8> opcss, bits<8> opcsd, + string OpcodeStr> { + // Intrinsic operation, reg. + def SSr_AVX : SS4AIi8<opcss, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3), + !strconcat(OpcodeStr, + "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, OpSize; + + // Intrinsic operation, mem. + def SSm_AVX : SS4AIi8<opcss, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32i8imm:$src3), + !strconcat(OpcodeStr, + "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, OpSize; + + // Intrinsic operation, reg. + def SDr_AVX : SS4AIi8<opcsd, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3), + !strconcat(OpcodeStr, + "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, OpSize; + + // Intrinsic operation, mem. 
+ def SDm_AVX : SS4AIi8<opcsd, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32i8imm:$src3), + !strconcat(OpcodeStr, + "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, OpSize; +} + +// FP round - roundss, roundps, roundsd, roundpd +let Predicates = [HasAVX] in { + // Intrinsic form + defm VROUND : sse41_fp_unop_rm<0x08, 0x09, "vround", f128mem, VR128, + memopv4f32, memopv2f64, + int_x86_sse41_round_ps, + int_x86_sse41_round_pd>, VEX; + defm VROUNDY : sse41_fp_unop_rm<0x08, 0x09, "vround", f256mem, VR256, + memopv8f32, memopv4f64, + int_x86_avx_round_ps_256, + int_x86_avx_round_pd_256>, VEX; + defm VROUND : sse41_fp_binop_rm<0x0A, 0x0B, "vround", + int_x86_sse41_round_ss, + int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG; + + // Instructions for the assembler + defm VROUND : sse41_fp_unop_rm_avx_p<0x08, 0x09, VR128, f128mem, "vround">, + VEX; + defm VROUNDY : sse41_fp_unop_rm_avx_p<0x08, 0x09, VR256, f256mem, "vround">, + VEX; + defm VROUND : sse41_fp_binop_rm_avx_s<0x0A, 0x0B, "vround">, VEX_4V, VEX_LIG; +} + +defm ROUND : sse41_fp_unop_rm<0x08, 0x09, "round", f128mem, VR128, + memopv4f32, memopv2f64, + int_x86_sse41_round_ps, int_x86_sse41_round_pd>; +let Constraints = "$src1 = $dst" in +defm ROUND : sse41_fp_binop_rm<0x0A, 0x0B, "round", + int_x86_sse41_round_ss, int_x86_sse41_round_sd>; + +//===----------------------------------------------------------------------===// +// SSE4.1 - Packed Bit Test +//===----------------------------------------------------------------------===// + +// ptest instruction we'll lower to this in X86ISelLowering primarily from +// the intel intrinsic that corresponds to this. +let Defs = [EFLAGS], Predicates = [HasAVX] in { +def VPTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2), + "vptest\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86ptest VR128:$src1, (v4f32 VR128:$src2)))]>, + OpSize, VEX; +def VPTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2), + "vptest\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS,(X86ptest VR128:$src1, (memopv4f32 addr:$src2)))]>, + OpSize, VEX; + +def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2), + "vptest\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>, + OpSize, VEX; +def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2), + "vptest\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS,(X86ptest VR256:$src1, (memopv4i64 addr:$src2)))]>, + OpSize, VEX; +} + +let Defs = [EFLAGS] in { +def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2), + "ptest \t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86ptest VR128:$src1, (v4f32 VR128:$src2)))]>, + OpSize; +def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2), + "ptest \t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86ptest VR128:$src1, (memopv4f32 addr:$src2)))]>, + OpSize; +} + +// The bit test instructions below are AVX only +multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC, + X86MemOperand x86memop, PatFrag mem_frag, ValueType vt> { + def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), + [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>, OpSize, VEX; + def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), + [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>, + OpSize, VEX; +} + +let Defs 
= [EFLAGS], Predicates = [HasAVX] in { +defm VTESTPS : avx_bittest<0x0E, "vtestps", VR128, f128mem, memopv4f32, v4f32>; +defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, memopv8f32, v8f32>; +defm VTESTPD : avx_bittest<0x0F, "vtestpd", VR128, f128mem, memopv2f64, v2f64>; +defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, memopv4f64, v4f64>; +} + +//===----------------------------------------------------------------------===// +// SSE4.1 - Misc Instructions +//===----------------------------------------------------------------------===// + +let Defs = [EFLAGS], Predicates = [HasPOPCNT] in { + def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), + "popcnt{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)]>, + OpSize, XS; + def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "popcnt{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, (ctpop (loadi16 addr:$src))), + (implicit EFLAGS)]>, OpSize, XS; + + def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "popcnt{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)]>, + XS; + def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "popcnt{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (ctpop (loadi32 addr:$src))), + (implicit EFLAGS)]>, XS; + + def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), + "popcnt{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)]>, + XS; + def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "popcnt{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (ctpop (loadi64 addr:$src))), + (implicit EFLAGS)]>, XS; +} + + + +// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16. 
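+// A minimal sketch of the expansion below: the single instantiation
+//   defm PHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "phminposuw",
+//                                           int_x86_sse41_phminposuw>;
+// produces a PHMINPOSUWrr128 register form plus a PHMINPOSUWrm128 form whose
+// pattern bitconverts a memopv8i16 load into the intrinsic operand.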
+multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr, + Intrinsic IntId128> { + def rr128 : SS48I<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (IntId128 VR128:$src))]>, OpSize; + def rm128 : SS48I<opc, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, + (IntId128 + (bitconvert (memopv8i16 addr:$src))))]>, OpSize; +} + +let Predicates = [HasAVX] in +defm VPHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "vphminposuw", + int_x86_sse41_phminposuw>, VEX; +defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw", + int_x86_sse41_phminposuw>; + +/// SS41I_binop_rm_int - Simple SSE 4.1 binary operator +multiclass SS41I_binop_rm_int<bits<8> opc, string OpcodeStr, + Intrinsic IntId128, bit Is2Addr = 1> { + let isCommutable = 1 in + def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>, OpSize; + def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, + (IntId128 VR128:$src1, + (bitconvert (memopv16i8 addr:$src2))))]>, OpSize; +} + +let Predicates = [HasAVX] in { + let isCommutable = 0 in + defm VPACKUSDW : SS41I_binop_rm_int<0x2B, "vpackusdw", int_x86_sse41_packusdw, + 0>, VEX_4V; + defm VPCMPEQQ : SS41I_binop_rm_int<0x29, "vpcmpeqq", int_x86_sse41_pcmpeqq, + 0>, VEX_4V; + defm VPMINSB : SS41I_binop_rm_int<0x38, "vpminsb", int_x86_sse41_pminsb, + 0>, VEX_4V; + defm VPMINSD : SS41I_binop_rm_int<0x39, "vpminsd", int_x86_sse41_pminsd, + 0>, VEX_4V; + defm VPMINUD : SS41I_binop_rm_int<0x3B, "vpminud", int_x86_sse41_pminud, + 0>, VEX_4V; + defm VPMINUW : SS41I_binop_rm_int<0x3A, "vpminuw", int_x86_sse41_pminuw, + 0>, VEX_4V; + defm VPMAXSB : SS41I_binop_rm_int<0x3C, "vpmaxsb", int_x86_sse41_pmaxsb, + 0>, VEX_4V; + defm VPMAXSD : SS41I_binop_rm_int<0x3D, "vpmaxsd", int_x86_sse41_pmaxsd, + 0>, VEX_4V; + defm VPMAXUD : SS41I_binop_rm_int<0x3F, "vpmaxud", int_x86_sse41_pmaxud, + 0>, VEX_4V; + defm VPMAXUW : SS41I_binop_rm_int<0x3E, "vpmaxuw", int_x86_sse41_pmaxuw, + 0>, VEX_4V; + defm VPMULDQ : SS41I_binop_rm_int<0x28, "vpmuldq", int_x86_sse41_pmuldq, + 0>, VEX_4V; + + def : Pat<(v2i64 (X86pcmpeqq VR128:$src1, VR128:$src2)), + (VPCMPEQQrr VR128:$src1, VR128:$src2)>; + def : Pat<(v2i64 (X86pcmpeqq VR128:$src1, (memop addr:$src2))), + (VPCMPEQQrm VR128:$src1, addr:$src2)>; +} + +let Constraints = "$src1 = $dst" in { + let isCommutable = 0 in + defm PACKUSDW : SS41I_binop_rm_int<0x2B, "packusdw", int_x86_sse41_packusdw>; + defm PCMPEQQ : SS41I_binop_rm_int<0x29, "pcmpeqq", int_x86_sse41_pcmpeqq>; + defm PMINSB : SS41I_binop_rm_int<0x38, "pminsb", int_x86_sse41_pminsb>; + defm PMINSD : SS41I_binop_rm_int<0x39, "pminsd", int_x86_sse41_pminsd>; + defm PMINUD : SS41I_binop_rm_int<0x3B, "pminud", int_x86_sse41_pminud>; + defm PMINUW : SS41I_binop_rm_int<0x3A, "pminuw", int_x86_sse41_pminuw>; + defm PMAXSB : SS41I_binop_rm_int<0x3C, "pmaxsb", int_x86_sse41_pmaxsb>; + defm PMAXSD : SS41I_binop_rm_int<0x3D, "pmaxsd", int_x86_sse41_pmaxsd>; + defm PMAXUD : SS41I_binop_rm_int<0x3F, "pmaxud", int_x86_sse41_pmaxud>; + defm PMAXUW : 
SS41I_binop_rm_int<0x3E, "pmaxuw", int_x86_sse41_pmaxuw>; + defm PMULDQ : SS41I_binop_rm_int<0x28, "pmuldq", int_x86_sse41_pmuldq>; +} + +def : Pat<(v2i64 (X86pcmpeqq VR128:$src1, VR128:$src2)), + (PCMPEQQrr VR128:$src1, VR128:$src2)>; +def : Pat<(v2i64 (X86pcmpeqq VR128:$src1, (memop addr:$src2))), + (PCMPEQQrm VR128:$src1, addr:$src2)>; + +/// SS48I_binop_rm - Simple SSE41 binary operator. +multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType OpVT, bit Is2Addr = 1> { + let isCommutable = 1 in + def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, (OpVT (OpNode VR128:$src1, VR128:$src2)))]>, + OpSize; + def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, (OpNode VR128:$src1, + (bc_v4i32 (memopv2i64 addr:$src2))))]>, + OpSize; +} + +let Predicates = [HasAVX] in + defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, 0>, VEX_4V; +let Constraints = "$src1 = $dst" in + defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32>; + +/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate +multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr, + Intrinsic IntId, RegisterClass RC, PatFrag memop_frag, + X86MemOperand x86memop, bit Is2Addr = 1> { + let isCommutable = 1 in + def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, u32u8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>, + OpSize; + def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2, u32u8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set RC:$dst, + (IntId RC:$src1, + (bitconvert (memop_frag addr:$src2)), imm:$src3))]>, + OpSize; +} + +let Predicates = [HasAVX] in { + let isCommutable = 0 in { + defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps, + VR128, memopv16i8, i128mem, 0>, VEX_4V; + defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd, + VR128, memopv16i8, i128mem, 0>, VEX_4V; + defm VBLENDPSY : SS41I_binop_rmi_int<0x0C, "vblendps", + int_x86_avx_blend_ps_256, VR256, memopv32i8, i256mem, 0>, VEX_4V; + defm VBLENDPDY : SS41I_binop_rmi_int<0x0D, "vblendpd", + int_x86_avx_blend_pd_256, VR256, memopv32i8, i256mem, 0>, VEX_4V; + defm VPBLENDW : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_sse41_pblendw, + VR128, memopv16i8, i128mem, 0>, VEX_4V; + defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw, + VR128, memopv16i8, i128mem, 0>, VEX_4V; + } + defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps, + VR128, memopv16i8, i128mem, 0>, VEX_4V; + defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd, + VR128, memopv16i8, i128mem, 0>, VEX_4V; + defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256, + VR256, memopv32i8, i256mem, 0>, VEX_4V; +} + +let Constraints = "$src1 = $dst" in { + let isCommutable = 0 
in { + defm BLENDPS : SS41I_binop_rmi_int<0x0C, "blendps", int_x86_sse41_blendps, + VR128, memopv16i8, i128mem>; + defm BLENDPD : SS41I_binop_rmi_int<0x0D, "blendpd", int_x86_sse41_blendpd, + VR128, memopv16i8, i128mem>; + defm PBLENDW : SS41I_binop_rmi_int<0x0E, "pblendw", int_x86_sse41_pblendw, + VR128, memopv16i8, i128mem>; + defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw, + VR128, memopv16i8, i128mem>; + } + defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps, + VR128, memopv16i8, i128mem>; + defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd, + VR128, memopv16i8, i128mem>; +} + +/// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operators +let Predicates = [HasAVX] in { +multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr, + RegisterClass RC, X86MemOperand x86memop, + PatFrag mem_frag, Intrinsic IntId> { + def rr : I<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, RC:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set RC:$dst, (IntId RC:$src1, RC:$src2, RC:$src3))], + SSEPackedInt>, OpSize, TA, VEX_4V, VEX_I8IMM; + + def rm : I<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2, RC:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set RC:$dst, + (IntId RC:$src1, (bitconvert (mem_frag addr:$src2)), + RC:$src3))], + SSEPackedInt>, OpSize, TA, VEX_4V, VEX_I8IMM; +} +} + +defm VBLENDVPD : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, i128mem, + memopv16i8, int_x86_sse41_blendvpd>; +defm VBLENDVPS : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, i128mem, + memopv16i8, int_x86_sse41_blendvps>; +defm VPBLENDVB : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem, + memopv16i8, int_x86_sse41_pblendvb>; +defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, i256mem, + memopv32i8, int_x86_avx_blendv_pd_256>; +defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, i256mem, + memopv32i8, int_x86_avx_blendv_ps_256>; + +let Predicates = [HasAVX] in { + def : Pat<(v16i8 (vselect (v16i8 VR128:$mask), (v16i8 VR128:$src1), + (v16i8 VR128:$src2))), + (VPBLENDVBrr VR128:$src2, VR128:$src1, VR128:$mask)>; + def : Pat<(v4i32 (vselect (v4i32 VR128:$mask), (v4i32 VR128:$src1), + (v4i32 VR128:$src2))), + (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>; + def : Pat<(v4f32 (vselect (v4i32 VR128:$mask), (v4f32 VR128:$src1), + (v4f32 VR128:$src2))), + (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>; + def : Pat<(v2i64 (vselect (v2i64 VR128:$mask), (v2i64 VR128:$src1), + (v2i64 VR128:$src2))), + (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>; + def : Pat<(v2f64 (vselect (v2i64 VR128:$mask), (v2f64 VR128:$src1), + (v2f64 VR128:$src2))), + (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>; + def : Pat<(v8i32 (vselect (v8i32 VR256:$mask), (v8i32 VR256:$src1), + (v8i32 VR256:$src2))), + (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>; + def : Pat<(v8f32 (vselect (v8i32 VR256:$mask), (v8f32 VR256:$src1), + (v8f32 VR256:$src2))), + (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>; + def : Pat<(v4i64 (vselect (v4i64 VR256:$mask), (v4i64 VR256:$src1), + (v4i64 VR256:$src2))), + (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>; + def : Pat<(v4f64 (vselect (v4i64 VR256:$mask), (v4f64 VR256:$src1), + (v4f64 VR256:$src2))), + (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>; +} + +/// SS41I_ternary_int - SSE 4.1 ternary operator +let Uses = [XMM0], 
Constraints = "$src1 = $dst" in { + multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, Intrinsic IntId> { + def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))]>, + OpSize; + + def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $dst|$dst, $src2}"), + [(set VR128:$dst, + (IntId VR128:$src1, + (bitconvert (memopv16i8 addr:$src2)), XMM0))]>, OpSize; + } +} + +defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", int_x86_sse41_blendvpd>; +defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", int_x86_sse41_blendvps>; +defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", int_x86_sse41_pblendvb>; + +let Predicates = [HasSSE41] in { + def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1), + (v16i8 VR128:$src2))), + (PBLENDVBrr0 VR128:$src2, VR128:$src1)>; + def : Pat<(v4i32 (vselect (v4i32 XMM0), (v4i32 VR128:$src1), + (v4i32 VR128:$src2))), + (BLENDVPSrr0 VR128:$src2, VR128:$src1)>; + def : Pat<(v4f32 (vselect (v4i32 XMM0), (v4f32 VR128:$src1), + (v4f32 VR128:$src2))), + (BLENDVPSrr0 VR128:$src2, VR128:$src1)>; + def : Pat<(v2i64 (vselect (v2i64 XMM0), (v2i64 VR128:$src1), + (v2i64 VR128:$src2))), + (BLENDVPDrr0 VR128:$src2, VR128:$src1)>; + def : Pat<(v2f64 (vselect (v2i64 XMM0), (v2f64 VR128:$src1), + (v2f64 VR128:$src2))), + (BLENDVPDrr0 VR128:$src2, VR128:$src1)>; +} + +let Predicates = [HasAVX] in +def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "vmovntdqa\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>, + OpSize, VEX; +def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "movntdqa\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>, + OpSize; + +//===----------------------------------------------------------------------===// +// SSE4.2 - Compare Instructions +//===----------------------------------------------------------------------===// + +/// SS42I_binop_rm_int - Simple SSE 4.2 binary operator +multiclass SS42I_binop_rm_int<bits<8> opc, string OpcodeStr, + Intrinsic IntId128, bit Is2Addr = 1> { + def rr : SS428I<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>, + OpSize; + def rm : SS428I<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, + (IntId128 VR128:$src1, + (bitconvert (memopv16i8 addr:$src2))))]>, OpSize; +} + +let Predicates = [HasAVX] in { + defm VPCMPGTQ : SS42I_binop_rm_int<0x37, "vpcmpgtq", int_x86_sse42_pcmpgtq, + 0>, VEX_4V; + + def : Pat<(v2i64 (X86pcmpgtq VR128:$src1, VR128:$src2)), + (VPCMPGTQrr VR128:$src1, VR128:$src2)>; + def : Pat<(v2i64 (X86pcmpgtq VR128:$src1, (memop addr:$src2))), + (VPCMPGTQrm VR128:$src1, addr:$src2)>; +} + +let Constraints = "$src1 = $dst" in + defm PCMPGTQ : SS42I_binop_rm_int<0x37, "pcmpgtq", int_x86_sse42_pcmpgtq>; + +def : Pat<(v2i64 (X86pcmpgtq VR128:$src1, VR128:$src2)), + (PCMPGTQrr VR128:$src1, VR128:$src2)>; +def : Pat<(v2i64 (X86pcmpgtq VR128:$src1, (memop addr:$src2))), + 
(PCMPGTQrm VR128:$src1, addr:$src2)>; + +//===----------------------------------------------------------------------===// +// SSE4.2 - String/text Processing Instructions +//===----------------------------------------------------------------------===// + +// Packed Compare Implicit Length Strings, Return Mask +multiclass pseudo_pcmpistrm<string asm> { + def REG : PseudoI<(outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, i8imm:$src3), + [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, VR128:$src2, + imm:$src3))]>; + def MEM : PseudoI<(outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2, i8imm:$src3), + [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 + VR128:$src1, (load addr:$src2), imm:$src3))]>; +} + +let Defs = [EFLAGS], usesCustomInserter = 1 in { + defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128">, Requires<[HasSSE42]>; + defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128">, Requires<[HasAVX]>; +} + +let Defs = [XMM0, EFLAGS], Predicates = [HasAVX] in { + def VPCMPISTRM128rr : SS42AI<0x62, MRMSrcReg, (outs), + (ins VR128:$src1, VR128:$src2, i8imm:$src3), + "vpcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize, VEX; + def VPCMPISTRM128rm : SS42AI<0x62, MRMSrcMem, (outs), + (ins VR128:$src1, i128mem:$src2, i8imm:$src3), + "vpcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize, VEX; +} + +let Defs = [XMM0, EFLAGS] in { + def PCMPISTRM128rr : SS42AI<0x62, MRMSrcReg, (outs), + (ins VR128:$src1, VR128:$src2, i8imm:$src3), + "pcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize; + def PCMPISTRM128rm : SS42AI<0x62, MRMSrcMem, (outs), + (ins VR128:$src1, i128mem:$src2, i8imm:$src3), + "pcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize; +} + +// Packed Compare Explicit Length Strings, Return Mask +multiclass pseudo_pcmpestrm<string asm> { + def REG : PseudoI<(outs VR128:$dst), + (ins VR128:$src1, VR128:$src3, i8imm:$src5), + [(set VR128:$dst, (int_x86_sse42_pcmpestrm128 + VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>; + def MEM : PseudoI<(outs VR128:$dst), + (ins VR128:$src1, i128mem:$src3, i8imm:$src5), + [(set VR128:$dst, (int_x86_sse42_pcmpestrm128 + VR128:$src1, EAX, (load addr:$src3), EDX, imm:$src5))]>; +} + +let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in { + defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128">, Requires<[HasSSE42]>; + defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128">, Requires<[HasAVX]>; +} + +let Predicates = [HasAVX], + Defs = [XMM0, EFLAGS], Uses = [EAX, EDX] in { + def VPCMPESTRM128rr : SS42AI<0x60, MRMSrcReg, (outs), + (ins VR128:$src1, VR128:$src3, i8imm:$src5), + "vpcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>, OpSize, VEX; + def VPCMPESTRM128rm : SS42AI<0x60, MRMSrcMem, (outs), + (ins VR128:$src1, i128mem:$src3, i8imm:$src5), + "vpcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>, OpSize, VEX; +} + +let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX] in { + def PCMPESTRM128rr : SS42AI<0x60, MRMSrcReg, (outs), + (ins VR128:$src1, VR128:$src3, i8imm:$src5), + "pcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>, OpSize; + def PCMPESTRM128rm : SS42AI<0x60, MRMSrcMem, (outs), + (ins VR128:$src1, i128mem:$src3, i8imm:$src5), + "pcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>, OpSize; +} + +// Packed Compare Implicit Length Strings, Return Index +let Defs = [ECX, EFLAGS] in { + multiclass SS42AI_pcmpistri<Intrinsic IntId128, string asm = "pcmpistri"> { + def rr : SS42AI<0x63, MRMSrcReg, (outs), + (ins VR128:$src1, VR128:$src2, 
i8imm:$src3), + !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), + [(set ECX, (IntId128 VR128:$src1, VR128:$src2, imm:$src3)), + (implicit EFLAGS)]>, OpSize; + def rm : SS42AI<0x63, MRMSrcMem, (outs), + (ins VR128:$src1, i128mem:$src2, i8imm:$src3), + !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), + [(set ECX, (IntId128 VR128:$src1, (load addr:$src2), imm:$src3)), + (implicit EFLAGS)]>, OpSize; + } +} + +let Predicates = [HasAVX] in { +defm VPCMPISTRI : SS42AI_pcmpistri<int_x86_sse42_pcmpistri128, "vpcmpistri">, + VEX; +defm VPCMPISTRIA : SS42AI_pcmpistri<int_x86_sse42_pcmpistria128, "vpcmpistri">, + VEX; +defm VPCMPISTRIC : SS42AI_pcmpistri<int_x86_sse42_pcmpistric128, "vpcmpistri">, + VEX; +defm VPCMPISTRIO : SS42AI_pcmpistri<int_x86_sse42_pcmpistrio128, "vpcmpistri">, + VEX; +defm VPCMPISTRIS : SS42AI_pcmpistri<int_x86_sse42_pcmpistris128, "vpcmpistri">, + VEX; +defm VPCMPISTRIZ : SS42AI_pcmpistri<int_x86_sse42_pcmpistriz128, "vpcmpistri">, + VEX; +} + +defm PCMPISTRI : SS42AI_pcmpistri<int_x86_sse42_pcmpistri128>; +defm PCMPISTRIA : SS42AI_pcmpistri<int_x86_sse42_pcmpistria128>; +defm PCMPISTRIC : SS42AI_pcmpistri<int_x86_sse42_pcmpistric128>; +defm PCMPISTRIO : SS42AI_pcmpistri<int_x86_sse42_pcmpistrio128>; +defm PCMPISTRIS : SS42AI_pcmpistri<int_x86_sse42_pcmpistris128>; +defm PCMPISTRIZ : SS42AI_pcmpistri<int_x86_sse42_pcmpistriz128>; + +// Packed Compare Explicit Length Strings, Return Index +let Defs = [ECX, EFLAGS], Uses = [EAX, EDX] in { + multiclass SS42AI_pcmpestri<Intrinsic IntId128, string asm = "pcmpestri"> { + def rr : SS42AI<0x61, MRMSrcReg, (outs), + (ins VR128:$src1, VR128:$src3, i8imm:$src5), + !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), + [(set ECX, (IntId128 VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5)), + (implicit EFLAGS)]>, OpSize; + def rm : SS42AI<0x61, MRMSrcMem, (outs), + (ins VR128:$src1, i128mem:$src3, i8imm:$src5), + !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), + [(set ECX, + (IntId128 VR128:$src1, EAX, (load addr:$src3), EDX, imm:$src5)), + (implicit EFLAGS)]>, OpSize; + } +} + +let Predicates = [HasAVX] in { +defm VPCMPESTRI : SS42AI_pcmpestri<int_x86_sse42_pcmpestri128, "vpcmpestri">, + VEX; +defm VPCMPESTRIA : SS42AI_pcmpestri<int_x86_sse42_pcmpestria128, "vpcmpestri">, + VEX; +defm VPCMPESTRIC : SS42AI_pcmpestri<int_x86_sse42_pcmpestric128, "vpcmpestri">, + VEX; +defm VPCMPESTRIO : SS42AI_pcmpestri<int_x86_sse42_pcmpestrio128, "vpcmpestri">, + VEX; +defm VPCMPESTRIS : SS42AI_pcmpestri<int_x86_sse42_pcmpestris128, "vpcmpestri">, + VEX; +defm VPCMPESTRIZ : SS42AI_pcmpestri<int_x86_sse42_pcmpestriz128, "vpcmpestri">, + VEX; +} + +defm PCMPESTRI : SS42AI_pcmpestri<int_x86_sse42_pcmpestri128>; +defm PCMPESTRIA : SS42AI_pcmpestri<int_x86_sse42_pcmpestria128>; +defm PCMPESTRIC : SS42AI_pcmpestri<int_x86_sse42_pcmpestric128>; +defm PCMPESTRIO : SS42AI_pcmpestri<int_x86_sse42_pcmpestrio128>; +defm PCMPESTRIS : SS42AI_pcmpestri<int_x86_sse42_pcmpestris128>; +defm PCMPESTRIZ : SS42AI_pcmpestri<int_x86_sse42_pcmpestriz128>; + +//===----------------------------------------------------------------------===// +// SSE4.2 - CRC Instructions +//===----------------------------------------------------------------------===// + +// No CRC instructions have AVX equivalents + +// crc intrinsic instruction +// This set of instructions are only rm, the only difference is the size +// of r and m. 
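+// For orientation (an illustrative note, not from the upstream sources): the
+// usual entry point is the SSE4.2 CRC32 builtin, e.g. _mm_crc32_u8(crc, byte)
+// in C, which is expected to become a call to llvm.x86.sse42.crc32.32.8
+// (int_x86_sse42_crc32_32_8) and is then selected to CRC32r32r8 or
+// CRC32r32m8 by the patterns below; the wider widths follow the same shape.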
+let Constraints = "$src1 = $dst" in { + def CRC32r32m8 : SS42FI<0xF0, MRMSrcMem, (outs GR32:$dst), + (ins GR32:$src1, i8mem:$src2), + "crc32{b} \t{$src2, $src1|$src1, $src2}", + [(set GR32:$dst, + (int_x86_sse42_crc32_32_8 GR32:$src1, + (load addr:$src2)))]>; + def CRC32r32r8 : SS42FI<0xF0, MRMSrcReg, (outs GR32:$dst), + (ins GR32:$src1, GR8:$src2), + "crc32{b} \t{$src2, $src1|$src1, $src2}", + [(set GR32:$dst, + (int_x86_sse42_crc32_32_8 GR32:$src1, GR8:$src2))]>; + def CRC32r32m16 : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst), + (ins GR32:$src1, i16mem:$src2), + "crc32{w} \t{$src2, $src1|$src1, $src2}", + [(set GR32:$dst, + (int_x86_sse42_crc32_32_16 GR32:$src1, + (load addr:$src2)))]>, + OpSize; + def CRC32r32r16 : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst), + (ins GR32:$src1, GR16:$src2), + "crc32{w} \t{$src2, $src1|$src1, $src2}", + [(set GR32:$dst, + (int_x86_sse42_crc32_32_16 GR32:$src1, GR16:$src2))]>, + OpSize; + def CRC32r32m32 : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst), + (ins GR32:$src1, i32mem:$src2), + "crc32{l} \t{$src2, $src1|$src1, $src2}", + [(set GR32:$dst, + (int_x86_sse42_crc32_32_32 GR32:$src1, + (load addr:$src2)))]>; + def CRC32r32r32 : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst), + (ins GR32:$src1, GR32:$src2), + "crc32{l} \t{$src2, $src1|$src1, $src2}", + [(set GR32:$dst, + (int_x86_sse42_crc32_32_32 GR32:$src1, GR32:$src2))]>; + def CRC32r64m8 : SS42FI<0xF0, MRMSrcMem, (outs GR64:$dst), + (ins GR64:$src1, i8mem:$src2), + "crc32{b} \t{$src2, $src1|$src1, $src2}", + [(set GR64:$dst, + (int_x86_sse42_crc32_64_8 GR64:$src1, + (load addr:$src2)))]>, + REX_W; + def CRC32r64r8 : SS42FI<0xF0, MRMSrcReg, (outs GR64:$dst), + (ins GR64:$src1, GR8:$src2), + "crc32{b} \t{$src2, $src1|$src1, $src2}", + [(set GR64:$dst, + (int_x86_sse42_crc32_64_8 GR64:$src1, GR8:$src2))]>, + REX_W; + def CRC32r64m64 : SS42FI<0xF1, MRMSrcMem, (outs GR64:$dst), + (ins GR64:$src1, i64mem:$src2), + "crc32{q} \t{$src2, $src1|$src1, $src2}", + [(set GR64:$dst, + (int_x86_sse42_crc32_64_64 GR64:$src1, + (load addr:$src2)))]>, + REX_W; + def CRC32r64r64 : SS42FI<0xF1, MRMSrcReg, (outs GR64:$dst), + (ins GR64:$src1, GR64:$src2), + "crc32{q} \t{$src2, $src1|$src1, $src2}", + [(set GR64:$dst, + (int_x86_sse42_crc32_64_64 GR64:$src1, GR64:$src2))]>, + REX_W; +} + +//===----------------------------------------------------------------------===// +// AES-NI Instructions +//===----------------------------------------------------------------------===// + +multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr, + Intrinsic IntId128, bit Is2Addr = 1> { + def rr : AES8I<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>, + OpSize; + def rm : AES8I<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, + (IntId128 VR128:$src1, + (bitconvert (memopv16i8 addr:$src2))))]>, OpSize; +} + +// Perform One Round of an AES Encryption/Decryption Flow +let Predicates = [HasAVX, HasAES] in { + defm VAESENC : AESI_binop_rm_int<0xDC, "vaesenc", + int_x86_aesni_aesenc, 0>, VEX_4V; + defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast", + int_x86_aesni_aesenclast, 0>, VEX_4V; + defm VAESDEC : AESI_binop_rm_int<0xDE, "vaesdec", + 
int_x86_aesni_aesdec, 0>, VEX_4V; + defm VAESDECLAST : AESI_binop_rm_int<0xDF, "vaesdeclast", + int_x86_aesni_aesdeclast, 0>, VEX_4V; +} + +let Constraints = "$src1 = $dst" in { + defm AESENC : AESI_binop_rm_int<0xDC, "aesenc", + int_x86_aesni_aesenc>; + defm AESENCLAST : AESI_binop_rm_int<0xDD, "aesenclast", + int_x86_aesni_aesenclast>; + defm AESDEC : AESI_binop_rm_int<0xDE, "aesdec", + int_x86_aesni_aesdec>; + defm AESDECLAST : AESI_binop_rm_int<0xDF, "aesdeclast", + int_x86_aesni_aesdeclast>; +} + +let Predicates = [HasAES] in { + def : Pat<(v2i64 (int_x86_aesni_aesenc VR128:$src1, VR128:$src2)), + (AESENCrr VR128:$src1, VR128:$src2)>; + def : Pat<(v2i64 (int_x86_aesni_aesenc VR128:$src1, (memop addr:$src2))), + (AESENCrm VR128:$src1, addr:$src2)>; + def : Pat<(v2i64 (int_x86_aesni_aesenclast VR128:$src1, VR128:$src2)), + (AESENCLASTrr VR128:$src1, VR128:$src2)>; + def : Pat<(v2i64 (int_x86_aesni_aesenclast VR128:$src1, (memop addr:$src2))), + (AESENCLASTrm VR128:$src1, addr:$src2)>; + def : Pat<(v2i64 (int_x86_aesni_aesdec VR128:$src1, VR128:$src2)), + (AESDECrr VR128:$src1, VR128:$src2)>; + def : Pat<(v2i64 (int_x86_aesni_aesdec VR128:$src1, (memop addr:$src2))), + (AESDECrm VR128:$src1, addr:$src2)>; + def : Pat<(v2i64 (int_x86_aesni_aesdeclast VR128:$src1, VR128:$src2)), + (AESDECLASTrr VR128:$src1, VR128:$src2)>; + def : Pat<(v2i64 (int_x86_aesni_aesdeclast VR128:$src1, (memop addr:$src2))), + (AESDECLASTrm VR128:$src1, addr:$src2)>; +} + +let Predicates = [HasAVX, HasAES], AddedComplexity = 20 in { + def : Pat<(v2i64 (int_x86_aesni_aesenc VR128:$src1, VR128:$src2)), + (VAESENCrr VR128:$src1, VR128:$src2)>; + def : Pat<(v2i64 (int_x86_aesni_aesenc VR128:$src1, (memop addr:$src2))), + (VAESENCrm VR128:$src1, addr:$src2)>; + def : Pat<(v2i64 (int_x86_aesni_aesenclast VR128:$src1, VR128:$src2)), + (VAESENCLASTrr VR128:$src1, VR128:$src2)>; + def : Pat<(v2i64 (int_x86_aesni_aesenclast VR128:$src1, (memop addr:$src2))), + (VAESENCLASTrm VR128:$src1, addr:$src2)>; + def : Pat<(v2i64 (int_x86_aesni_aesdec VR128:$src1, VR128:$src2)), + (VAESDECrr VR128:$src1, VR128:$src2)>; + def : Pat<(v2i64 (int_x86_aesni_aesdec VR128:$src1, (memop addr:$src2))), + (VAESDECrm VR128:$src1, addr:$src2)>; + def : Pat<(v2i64 (int_x86_aesni_aesdeclast VR128:$src1, VR128:$src2)), + (VAESDECLASTrr VR128:$src1, VR128:$src2)>; + def : Pat<(v2i64 (int_x86_aesni_aesdeclast VR128:$src1, (memop addr:$src2))), + (VAESDECLASTrm VR128:$src1, addr:$src2)>; +} + +// Perform the AES InvMixColumn Transformation +let Predicates = [HasAVX, HasAES] in { + def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1), + "vaesimc\t{$src1, $dst|$dst, $src1}", + [(set VR128:$dst, + (int_x86_aesni_aesimc VR128:$src1))]>, + OpSize, VEX; + def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src1), + "vaesimc\t{$src1, $dst|$dst, $src1}", + [(set VR128:$dst, + (int_x86_aesni_aesimc (bitconvert (memopv2i64 addr:$src1))))]>, + OpSize, VEX; +} +def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1), + "aesimc\t{$src1, $dst|$dst, $src1}", + [(set VR128:$dst, + (int_x86_aesni_aesimc VR128:$src1))]>, + OpSize; +def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src1), + "aesimc\t{$src1, $dst|$dst, $src1}", + [(set VR128:$dst, + (int_x86_aesni_aesimc (bitconvert (memopv2i64 addr:$src1))))]>, + OpSize; + +// AES Round Key Generation Assist +let Predicates = [HasAVX, HasAES] in { + def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst), + (ins 
VR128:$src1, i8imm:$src2), + "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>, + OpSize, VEX; + def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src1, i8imm:$src2), + "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (int_x86_aesni_aeskeygenassist (bitconvert (memopv2i64 addr:$src1)), + imm:$src2))]>, + OpSize, VEX; +} +def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, i8imm:$src2), + "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>, + OpSize; +def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src1, i8imm:$src2), + "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (int_x86_aesni_aeskeygenassist (bitconvert (memopv2i64 addr:$src1)), + imm:$src2))]>, + OpSize; + +//===----------------------------------------------------------------------===// +// CLMUL Instructions +//===----------------------------------------------------------------------===// + +// Carry-less Multiplication instructions +let Constraints = "$src1 = $dst" in { +def PCLMULQDQrr : CLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, i8imm:$src3), + "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", + []>; + +def PCLMULQDQrm : CLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2, i8imm:$src3), + "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", + []>; +} + +// AVX carry-less Multiplication instructions +def VPCLMULQDQrr : AVXCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, i8imm:$src3), + "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>; + +def VPCLMULQDQrm : AVXCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2, i8imm:$src3), + "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>; + + +multiclass pclmul_alias<string asm, int immop> { + def : InstAlias<!strconcat("pclmul", asm, + "dq {$src, $dst|$dst, $src}"), + (PCLMULQDQrr VR128:$dst, VR128:$src, immop)>; + + def : InstAlias<!strconcat("pclmul", asm, + "dq {$src, $dst|$dst, $src}"), + (PCLMULQDQrm VR128:$dst, i128mem:$src, immop)>; + + def : InstAlias<!strconcat("vpclmul", asm, + "dq {$src2, $src1, $dst|$dst, $src1, $src2}"), + (VPCLMULQDQrr VR128:$dst, VR128:$src1, VR128:$src2, immop)>; + + def : InstAlias<!strconcat("vpclmul", asm, + "dq {$src2, $src1, $dst|$dst, $src1, $src2}"), + (VPCLMULQDQrm VR128:$dst, VR128:$src1, i128mem:$src2, immop)>; +} +defm : pclmul_alias<"hqhq", 0x11>; +defm : pclmul_alias<"hqlq", 0x01>; +defm : pclmul_alias<"lqhq", 0x10>; +defm : pclmul_alias<"lqlq", 0x00>; + +//===----------------------------------------------------------------------===// +// AVX Instructions +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// VBROADCAST - Load from memory and broadcast to all elements of the +// destination operand +// +class avx_broadcast<bits<8> opc, string OpcodeStr, RegisterClass RC, + X86MemOperand x86memop, Intrinsic Int> : + AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (Int addr:$src))]>, VEX; + +def VBROADCASTSS : avx_broadcast<0x18, 
"vbroadcastss", VR128, f32mem, + int_x86_avx_vbroadcastss>; +def VBROADCASTSSY : avx_broadcast<0x18, "vbroadcastss", VR256, f32mem, + int_x86_avx_vbroadcastss_256>; +def VBROADCASTSD : avx_broadcast<0x19, "vbroadcastsd", VR256, f64mem, + int_x86_avx_vbroadcast_sd_256>; +def VBROADCASTF128 : avx_broadcast<0x1A, "vbroadcastf128", VR256, f128mem, + int_x86_avx_vbroadcastf128_pd_256>; + +def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src), + (VBROADCASTF128 addr:$src)>; + +def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))), + (VBROADCASTSSY addr:$src)>; +def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))), + (VBROADCASTSD addr:$src)>; +def : Pat<(v8f32 (X86VBroadcast (loadf32 addr:$src))), + (VBROADCASTSSY addr:$src)>; +def : Pat<(v4f64 (X86VBroadcast (loadf64 addr:$src))), + (VBROADCASTSD addr:$src)>; + +def : Pat<(v4f32 (X86VBroadcast (loadf32 addr:$src))), + (VBROADCASTSS addr:$src)>; +def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), + (VBROADCASTSS addr:$src)>; + +//===----------------------------------------------------------------------===// +// VINSERTF128 - Insert packed floating-point values +// +def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR128:$src2, i8imm:$src3), + "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, VEX_4V; +def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, f128mem:$src2, i8imm:$src3), + "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, VEX_4V; + +def : Pat<(int_x86_avx_vinsertf128_pd_256 VR256:$src1, VR128:$src2, imm:$src3), + (VINSERTF128rr VR256:$src1, VR128:$src2, imm:$src3)>; +def : Pat<(int_x86_avx_vinsertf128_ps_256 VR256:$src1, VR128:$src2, imm:$src3), + (VINSERTF128rr VR256:$src1, VR128:$src2, imm:$src3)>; +def : Pat<(int_x86_avx_vinsertf128_si_256 VR256:$src1, VR128:$src2, imm:$src3), + (VINSERTF128rr VR256:$src1, VR128:$src2, imm:$src3)>; + +def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2), + (i32 imm)), + (VINSERTF128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2), + (i32 imm)), + (VINSERTF128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2), + (i32 imm)), + (VINSERTF128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2), + (i32 imm)), + (VINSERTF128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2), + (i32 imm)), + (VINSERTF128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; +def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2), + (i32 imm)), + (VINSERTF128rr VR256:$src1, VR128:$src2, + (INSERT_get_vinsertf128_imm VR256:$ins))>; + +//===----------------------------------------------------------------------===// +// VEXTRACTF128 - Extract packed floating-point values +// +def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst), + (ins VR256:$src1, i8imm:$src2), + "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, VEX; +def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs), + (ins f128mem:$dst, VR256:$src1, i8imm:$src2), + "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, VEX; + +def : 
Pat<(int_x86_avx_vextractf128_pd_256 VR256:$src1, imm:$src2), + (VEXTRACTF128rr VR256:$src1, imm:$src2)>; +def : Pat<(int_x86_avx_vextractf128_ps_256 VR256:$src1, imm:$src2), + (VEXTRACTF128rr VR256:$src1, imm:$src2)>; +def : Pat<(int_x86_avx_vextractf128_si_256 VR256:$src1, imm:$src2), + (VEXTRACTF128rr VR256:$src1, imm:$src2)>; + +def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), + (v4f32 (VEXTRACTF128rr + (v8f32 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; +def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), + (v2f64 (VEXTRACTF128rr + (v4f64 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; +def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), + (v4i32 (VEXTRACTF128rr + (v8i32 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; +def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), + (v2i64 (VEXTRACTF128rr + (v4i64 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; +def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), + (v8i16 (VEXTRACTF128rr + (v16i16 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; +def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), + (v16i8 (VEXTRACTF128rr + (v32i8 VR256:$src1), + (EXTRACT_get_vextractf128_imm VR128:$ext)))>; + +//===----------------------------------------------------------------------===// +// VMASKMOV - Conditional SIMD Packed Loads and Stores +// +multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr, + Intrinsic IntLd, Intrinsic IntLd256, + Intrinsic IntSt, Intrinsic IntSt256, + PatFrag pf128, PatFrag pf256> { + def rm : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, f128mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>, + VEX_4V; + def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, f256mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>, + VEX_4V; + def mr : AVX8I<opc_mr, MRMDestMem, (outs), + (ins f128mem:$dst, VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>, VEX_4V; + def Ymr : AVX8I<opc_mr, MRMDestMem, (outs), + (ins f256mem:$dst, VR256:$src1, VR256:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V; +} + +defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps", + int_x86_avx_maskload_ps, + int_x86_avx_maskload_ps_256, + int_x86_avx_maskstore_ps, + int_x86_avx_maskstore_ps_256, + memopv4f32, memopv8f32>; +defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd", + int_x86_avx_maskload_pd, + int_x86_avx_maskload_pd_256, + int_x86_avx_maskstore_pd, + int_x86_avx_maskstore_pd_256, + memopv2f64, memopv4f64>; + +//===----------------------------------------------------------------------===// +// VPERMIL - Permute Single and Double Floating-Point Values +// +multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr, + RegisterClass RC, X86MemOperand x86memop_f, + X86MemOperand x86memop_i, PatFrag f_frag, PatFrag i_frag, + Intrinsic IntVar, Intrinsic IntImm> { + def rr : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (IntVar RC:$src1, RC:$src2))]>, VEX_4V; + def rm : AVX8I<opc_rm, MRMSrcMem, 
(outs RC:$dst), + (ins RC:$src1, x86memop_i:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (IntVar RC:$src1, (i_frag addr:$src2)))]>, VEX_4V; + + def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, i8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (IntImm RC:$src1, imm:$src2))]>, VEX; + def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst), + (ins x86memop_f:$src1, i8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (IntImm (f_frag addr:$src1), imm:$src2))]>, VEX; +} + +defm VPERMILPS : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem, + memopv4f32, memopv4i32, + int_x86_avx_vpermilvar_ps, + int_x86_avx_vpermil_ps>; +defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem, + memopv8f32, memopv8i32, + int_x86_avx_vpermilvar_ps_256, + int_x86_avx_vpermil_ps_256>; +defm VPERMILPD : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem, + memopv2f64, memopv2i64, + int_x86_avx_vpermilvar_pd, + int_x86_avx_vpermil_pd>; +defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem, + memopv4f64, memopv4i64, + int_x86_avx_vpermilvar_pd_256, + int_x86_avx_vpermil_pd_256>; + +def : Pat<(v8f32 (X86VPermilpsy VR256:$src1, (i8 imm:$imm))), + (VPERMILPSYri VR256:$src1, imm:$imm)>; +def : Pat<(v4f64 (X86VPermilpdy VR256:$src1, (i8 imm:$imm))), + (VPERMILPDYri VR256:$src1, imm:$imm)>; +def : Pat<(v8i32 (X86VPermilpsy VR256:$src1, (i8 imm:$imm))), + (VPERMILPSYri VR256:$src1, imm:$imm)>; +def : Pat<(v4i64 (X86VPermilpdy VR256:$src1, (i8 imm:$imm))), + (VPERMILPDYri VR256:$src1, imm:$imm)>; + +//===----------------------------------------------------------------------===// +// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks +// +def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, i8imm:$src3), + "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, VEX_4V; +def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, f256mem:$src2, i8imm:$src3), + "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, VEX_4V; + +def : Pat<(int_x86_avx_vperm2f128_ps_256 VR256:$src1, VR256:$src2, imm:$src3), + (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$src3)>; +def : Pat<(int_x86_avx_vperm2f128_pd_256 VR256:$src1, VR256:$src2, imm:$src3), + (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$src3)>; +def : Pat<(int_x86_avx_vperm2f128_si_256 VR256:$src1, VR256:$src2, imm:$src3), + (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$src3)>; + +def : Pat<(int_x86_avx_vperm2f128_ps_256 + VR256:$src1, (memopv8f32 addr:$src2), imm:$src3), + (VPERM2F128rm VR256:$src1, addr:$src2, imm:$src3)>; +def : Pat<(int_x86_avx_vperm2f128_pd_256 + VR256:$src1, (memopv4f64 addr:$src2), imm:$src3), + (VPERM2F128rm VR256:$src1, addr:$src2, imm:$src3)>; +def : Pat<(int_x86_avx_vperm2f128_si_256 + VR256:$src1, (memopv8i32 addr:$src2), imm:$src3), + (VPERM2F128rm VR256:$src1, addr:$src2, imm:$src3)>; + +def : Pat<(v8f32 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v8i32 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v4i64 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v4f64 (X86VPerm2f128 
VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v32i8 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v16i16 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; + +//===----------------------------------------------------------------------===// +// VZERO - Zero YMM registers +// +let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7, + YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in { + // Zero All YMM registers + def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall", + [(int_x86_avx_vzeroall)]>, TB, VEX, VEX_L, Requires<[HasAVX]>; + + // Zero Upper bits of YMM registers + def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper", + [(int_x86_avx_vzeroupper)]>, TB, VEX, Requires<[HasAVX]>; +} + +//===----------------------------------------------------------------------===// +// Half precision conversion instructions +// +let Predicates = [HasAVX, HasF16C] in { + def VCVTPH2PSrm : I<0x13, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), + "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8, OpSize, VEX; + def VCVTPH2PSrr : I<0x13, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8, OpSize, VEX; + def VCVTPH2PSYrm : I<0x13, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src), + "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8, OpSize, VEX; + def VCVTPH2PSYrr : I<0x13, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), + "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8, OpSize, VEX; + def VCVTPS2PHmr : Ii8<0x1D, MRMDestMem, (outs f64mem:$dst), + (ins VR128:$src1, i32i8imm:$src2), + "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + TA, OpSize, VEX; + def VCVTPS2PHrr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst), + (ins VR128:$src1, i32i8imm:$src2), + "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + TA, OpSize, VEX; + def VCVTPS2PHYmr : Ii8<0x1D, MRMDestMem, (outs f128mem:$dst), + (ins VR256:$src1, i32i8imm:$src2), + "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + TA, OpSize, VEX; + def VCVTPS2PHYrr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst), + (ins VR256:$src1, i32i8imm:$src2), + "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + TA, OpSize, VEX; +} diff --git a/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td b/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td new file mode 100644 index 0000000..8278568 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td @@ -0,0 +1,746 @@ +//===- X86InstrShiftRotate.td - Shift and Rotate Instrs ----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the shift and rotate instructions. +// +//===----------------------------------------------------------------------===// + +// FIXME: Someone needs to smear multipattern goodness all over this file. 
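+//
+// As a sketch of what that could look like (an illustration only, not part of
+// the upstream file), a multiclass could stamp out the shift-by-CL register
+// form for each width from one template; instantiated inside the same
+// Defs = [EFLAGS] / "$src1 = $dst" blocks used below, the 8-bit case would be:
+//
+//   multiclass ShiftByCL<bits<8> opc, Format f, string asm, SDNode node,
+//                        RegisterClass RC> {
+//     def rCL : I<opc, f, (outs RC:$dst), (ins RC:$src1),
+//                 !strconcat(asm, "\t{%cl, $dst|$dst, CL}"),
+//                 [(set RC:$dst, (node RC:$src1, CL))]>;
+//   }
+//   defm SHL8 : ShiftByCL<0xD2, MRM4r, "shl{b}", shl, GR8>;  // gives SHL8rCL
+//
+// The 16/32/64-bit variants would additionally need OpSize or the REX.W
+// encoding; the file currently spells every form out by hand, as below.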
+ +let Defs = [EFLAGS] in { + +let Constraints = "$src1 = $dst" in { +let Uses = [CL] in { +def SHL8rCL : I<0xD2, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1), + "shl{b}\t{%cl, $dst|$dst, CL}", + [(set GR8:$dst, (shl GR8:$src1, CL))]>; +def SHL16rCL : I<0xD3, MRM4r, (outs GR16:$dst), (ins GR16:$src1), + "shl{w}\t{%cl, $dst|$dst, CL}", + [(set GR16:$dst, (shl GR16:$src1, CL))]>, OpSize; +def SHL32rCL : I<0xD3, MRM4r, (outs GR32:$dst), (ins GR32:$src1), + "shl{l}\t{%cl, $dst|$dst, CL}", + [(set GR32:$dst, (shl GR32:$src1, CL))]>; +def SHL64rCL : RI<0xD3, MRM4r, (outs GR64:$dst), (ins GR64:$src1), + "shl{q}\t{%cl, $dst|$dst, %CL}", + [(set GR64:$dst, (shl GR64:$src1, CL))]>; +} // Uses = [CL] + +def SHL8ri : Ii8<0xC0, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2), + "shl{b}\t{$src2, $dst|$dst, $src2}", + [(set GR8:$dst, (shl GR8:$src1, (i8 imm:$src2)))]>; + +let isConvertibleToThreeAddress = 1 in { // Can transform into LEA. +def SHL16ri : Ii8<0xC1, MRM4r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2), + "shl{w}\t{$src2, $dst|$dst, $src2}", + [(set GR16:$dst, (shl GR16:$src1, (i8 imm:$src2)))]>, OpSize; +def SHL32ri : Ii8<0xC1, MRM4r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2), + "shl{l}\t{$src2, $dst|$dst, $src2}", + [(set GR32:$dst, (shl GR32:$src1, (i8 imm:$src2)))]>; +def SHL64ri : RIi8<0xC1, MRM4r, (outs GR64:$dst), + (ins GR64:$src1, i8imm:$src2), + "shl{q}\t{$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (shl GR64:$src1, (i8 imm:$src2)))]>; + +// NOTE: We don't include patterns for shifts of a register by one, because +// 'add reg,reg' is cheaper (and we have a Pat pattern for shift-by-one). +def SHL8r1 : I<0xD0, MRM4r, (outs GR8:$dst), (ins GR8:$src1), + "shl{b}\t$dst", []>; +def SHL16r1 : I<0xD1, MRM4r, (outs GR16:$dst), (ins GR16:$src1), + "shl{w}\t$dst", []>, OpSize; +def SHL32r1 : I<0xD1, MRM4r, (outs GR32:$dst), (ins GR32:$src1), + "shl{l}\t$dst", []>; +def SHL64r1 : RI<0xD1, MRM4r, (outs GR64:$dst), (ins GR64:$src1), + "shl{q}\t$dst", []>; +} // isConvertibleToThreeAddress = 1 +} // Constraints = "$src = $dst" + + +// FIXME: Why do we need an explicit "Uses = [CL]" when the instr has a pattern +// using CL? 
+let Uses = [CL] in { +def SHL8mCL : I<0xD2, MRM4m, (outs), (ins i8mem :$dst), + "shl{b}\t{%cl, $dst|$dst, CL}", + [(store (shl (loadi8 addr:$dst), CL), addr:$dst)]>; +def SHL16mCL : I<0xD3, MRM4m, (outs), (ins i16mem:$dst), + "shl{w}\t{%cl, $dst|$dst, CL}", + [(store (shl (loadi16 addr:$dst), CL), addr:$dst)]>, OpSize; +def SHL32mCL : I<0xD3, MRM4m, (outs), (ins i32mem:$dst), + "shl{l}\t{%cl, $dst|$dst, CL}", + [(store (shl (loadi32 addr:$dst), CL), addr:$dst)]>; +def SHL64mCL : RI<0xD3, MRM4m, (outs), (ins i64mem:$dst), + "shl{q}\t{%cl, $dst|$dst, %CL}", + [(store (shl (loadi64 addr:$dst), CL), addr:$dst)]>; +} +def SHL8mi : Ii8<0xC0, MRM4m, (outs), (ins i8mem :$dst, i8imm:$src), + "shl{b}\t{$src, $dst|$dst, $src}", + [(store (shl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>; +def SHL16mi : Ii8<0xC1, MRM4m, (outs), (ins i16mem:$dst, i8imm:$src), + "shl{w}\t{$src, $dst|$dst, $src}", + [(store (shl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>, + OpSize; +def SHL32mi : Ii8<0xC1, MRM4m, (outs), (ins i32mem:$dst, i8imm:$src), + "shl{l}\t{$src, $dst|$dst, $src}", + [(store (shl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>; +def SHL64mi : RIi8<0xC1, MRM4m, (outs), (ins i64mem:$dst, i8imm:$src), + "shl{q}\t{$src, $dst|$dst, $src}", + [(store (shl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>; + +// Shift by 1 +def SHL8m1 : I<0xD0, MRM4m, (outs), (ins i8mem :$dst), + "shl{b}\t$dst", + [(store (shl (loadi8 addr:$dst), (i8 1)), addr:$dst)]>; +def SHL16m1 : I<0xD1, MRM4m, (outs), (ins i16mem:$dst), + "shl{w}\t$dst", + [(store (shl (loadi16 addr:$dst), (i8 1)), addr:$dst)]>, + OpSize; +def SHL32m1 : I<0xD1, MRM4m, (outs), (ins i32mem:$dst), + "shl{l}\t$dst", + [(store (shl (loadi32 addr:$dst), (i8 1)), addr:$dst)]>; +def SHL64m1 : RI<0xD1, MRM4m, (outs), (ins i64mem:$dst), + "shl{q}\t$dst", + [(store (shl (loadi64 addr:$dst), (i8 1)), addr:$dst)]>; + +let Constraints = "$src1 = $dst" in { +let Uses = [CL] in { +def SHR8rCL : I<0xD2, MRM5r, (outs GR8 :$dst), (ins GR8 :$src1), + "shr{b}\t{%cl, $dst|$dst, CL}", + [(set GR8:$dst, (srl GR8:$src1, CL))]>; +def SHR16rCL : I<0xD3, MRM5r, (outs GR16:$dst), (ins GR16:$src1), + "shr{w}\t{%cl, $dst|$dst, CL}", + [(set GR16:$dst, (srl GR16:$src1, CL))]>, OpSize; +def SHR32rCL : I<0xD3, MRM5r, (outs GR32:$dst), (ins GR32:$src1), + "shr{l}\t{%cl, $dst|$dst, CL}", + [(set GR32:$dst, (srl GR32:$src1, CL))]>; +def SHR64rCL : RI<0xD3, MRM5r, (outs GR64:$dst), (ins GR64:$src1), + "shr{q}\t{%cl, $dst|$dst, %CL}", + [(set GR64:$dst, (srl GR64:$src1, CL))]>; +} + +def SHR8ri : Ii8<0xC0, MRM5r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2), + "shr{b}\t{$src2, $dst|$dst, $src2}", + [(set GR8:$dst, (srl GR8:$src1, (i8 imm:$src2)))]>; +def SHR16ri : Ii8<0xC1, MRM5r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2), + "shr{w}\t{$src2, $dst|$dst, $src2}", + [(set GR16:$dst, (srl GR16:$src1, (i8 imm:$src2)))]>, OpSize; +def SHR32ri : Ii8<0xC1, MRM5r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2), + "shr{l}\t{$src2, $dst|$dst, $src2}", + [(set GR32:$dst, (srl GR32:$src1, (i8 imm:$src2)))]>; +def SHR64ri : RIi8<0xC1, MRM5r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$src2), + "shr{q}\t{$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (srl GR64:$src1, (i8 imm:$src2)))]>; + +// Shift right by 1 +def SHR8r1 : I<0xD0, MRM5r, (outs GR8:$dst), (ins GR8:$src1), + "shr{b}\t$dst", + [(set GR8:$dst, (srl GR8:$src1, (i8 1)))]>; +def SHR16r1 : I<0xD1, MRM5r, (outs GR16:$dst), (ins GR16:$src1), + "shr{w}\t$dst", + [(set GR16:$dst, (srl GR16:$src1, (i8 1)))]>, OpSize; +def SHR32r1 
: I<0xD1, MRM5r, (outs GR32:$dst), (ins GR32:$src1), + "shr{l}\t$dst", + [(set GR32:$dst, (srl GR32:$src1, (i8 1)))]>; +def SHR64r1 : RI<0xD1, MRM5r, (outs GR64:$dst), (ins GR64:$src1), + "shr{q}\t$dst", + [(set GR64:$dst, (srl GR64:$src1, (i8 1)))]>; +} // Constraints = "$src = $dst" + + +let Uses = [CL] in { +def SHR8mCL : I<0xD2, MRM5m, (outs), (ins i8mem :$dst), + "shr{b}\t{%cl, $dst|$dst, CL}", + [(store (srl (loadi8 addr:$dst), CL), addr:$dst)]>; +def SHR16mCL : I<0xD3, MRM5m, (outs), (ins i16mem:$dst), + "shr{w}\t{%cl, $dst|$dst, CL}", + [(store (srl (loadi16 addr:$dst), CL), addr:$dst)]>, + OpSize; +def SHR32mCL : I<0xD3, MRM5m, (outs), (ins i32mem:$dst), + "shr{l}\t{%cl, $dst|$dst, CL}", + [(store (srl (loadi32 addr:$dst), CL), addr:$dst)]>; +def SHR64mCL : RI<0xD3, MRM5m, (outs), (ins i64mem:$dst), + "shr{q}\t{%cl, $dst|$dst, %CL}", + [(store (srl (loadi64 addr:$dst), CL), addr:$dst)]>; +} +def SHR8mi : Ii8<0xC0, MRM5m, (outs), (ins i8mem :$dst, i8imm:$src), + "shr{b}\t{$src, $dst|$dst, $src}", + [(store (srl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>; +def SHR16mi : Ii8<0xC1, MRM5m, (outs), (ins i16mem:$dst, i8imm:$src), + "shr{w}\t{$src, $dst|$dst, $src}", + [(store (srl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>, + OpSize; +def SHR32mi : Ii8<0xC1, MRM5m, (outs), (ins i32mem:$dst, i8imm:$src), + "shr{l}\t{$src, $dst|$dst, $src}", + [(store (srl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>; +def SHR64mi : RIi8<0xC1, MRM5m, (outs), (ins i64mem:$dst, i8imm:$src), + "shr{q}\t{$src, $dst|$dst, $src}", + [(store (srl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>; + +// Shift by 1 +def SHR8m1 : I<0xD0, MRM5m, (outs), (ins i8mem :$dst), + "shr{b}\t$dst", + [(store (srl (loadi8 addr:$dst), (i8 1)), addr:$dst)]>; +def SHR16m1 : I<0xD1, MRM5m, (outs), (ins i16mem:$dst), + "shr{w}\t$dst", + [(store (srl (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,OpSize; +def SHR32m1 : I<0xD1, MRM5m, (outs), (ins i32mem:$dst), + "shr{l}\t$dst", + [(store (srl (loadi32 addr:$dst), (i8 1)), addr:$dst)]>; +def SHR64m1 : RI<0xD1, MRM5m, (outs), (ins i64mem:$dst), + "shr{q}\t$dst", + [(store (srl (loadi64 addr:$dst), (i8 1)), addr:$dst)]>; + +let Constraints = "$src1 = $dst" in { +let Uses = [CL] in { +def SAR8rCL : I<0xD2, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1), + "sar{b}\t{%cl, $dst|$dst, CL}", + [(set GR8:$dst, (sra GR8:$src1, CL))]>; +def SAR16rCL : I<0xD3, MRM7r, (outs GR16:$dst), (ins GR16:$src1), + "sar{w}\t{%cl, $dst|$dst, CL}", + [(set GR16:$dst, (sra GR16:$src1, CL))]>, OpSize; +def SAR32rCL : I<0xD3, MRM7r, (outs GR32:$dst), (ins GR32:$src1), + "sar{l}\t{%cl, $dst|$dst, CL}", + [(set GR32:$dst, (sra GR32:$src1, CL))]>; +def SAR64rCL : RI<0xD3, MRM7r, (outs GR64:$dst), (ins GR64:$src1), + "sar{q}\t{%cl, $dst|$dst, %CL}", + [(set GR64:$dst, (sra GR64:$src1, CL))]>; +} + +def SAR8ri : Ii8<0xC0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2), + "sar{b}\t{$src2, $dst|$dst, $src2}", + [(set GR8:$dst, (sra GR8:$src1, (i8 imm:$src2)))]>; +def SAR16ri : Ii8<0xC1, MRM7r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2), + "sar{w}\t{$src2, $dst|$dst, $src2}", + [(set GR16:$dst, (sra GR16:$src1, (i8 imm:$src2)))]>, + OpSize; +def SAR32ri : Ii8<0xC1, MRM7r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2), + "sar{l}\t{$src2, $dst|$dst, $src2}", + [(set GR32:$dst, (sra GR32:$src1, (i8 imm:$src2)))]>; +def SAR64ri : RIi8<0xC1, MRM7r, (outs GR64:$dst), + (ins GR64:$src1, i8imm:$src2), + "sar{q}\t{$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (sra GR64:$src1, (i8 imm:$src2)))]>; + +// 
Shift by 1 +def SAR8r1 : I<0xD0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1), + "sar{b}\t$dst", + [(set GR8:$dst, (sra GR8:$src1, (i8 1)))]>; +def SAR16r1 : I<0xD1, MRM7r, (outs GR16:$dst), (ins GR16:$src1), + "sar{w}\t$dst", + [(set GR16:$dst, (sra GR16:$src1, (i8 1)))]>, OpSize; +def SAR32r1 : I<0xD1, MRM7r, (outs GR32:$dst), (ins GR32:$src1), + "sar{l}\t$dst", + [(set GR32:$dst, (sra GR32:$src1, (i8 1)))]>; +def SAR64r1 : RI<0xD1, MRM7r, (outs GR64:$dst), (ins GR64:$src1), + "sar{q}\t$dst", + [(set GR64:$dst, (sra GR64:$src1, (i8 1)))]>; +} // Constraints = "$src = $dst" + + +let Uses = [CL] in { +def SAR8mCL : I<0xD2, MRM7m, (outs), (ins i8mem :$dst), + "sar{b}\t{%cl, $dst|$dst, CL}", + [(store (sra (loadi8 addr:$dst), CL), addr:$dst)]>; +def SAR16mCL : I<0xD3, MRM7m, (outs), (ins i16mem:$dst), + "sar{w}\t{%cl, $dst|$dst, CL}", + [(store (sra (loadi16 addr:$dst), CL), addr:$dst)]>, OpSize; +def SAR32mCL : I<0xD3, MRM7m, (outs), (ins i32mem:$dst), + "sar{l}\t{%cl, $dst|$dst, CL}", + [(store (sra (loadi32 addr:$dst), CL), addr:$dst)]>; +def SAR64mCL : RI<0xD3, MRM7m, (outs), (ins i64mem:$dst), + "sar{q}\t{%cl, $dst|$dst, %CL}", + [(store (sra (loadi64 addr:$dst), CL), addr:$dst)]>; +} +def SAR8mi : Ii8<0xC0, MRM7m, (outs), (ins i8mem :$dst, i8imm:$src), + "sar{b}\t{$src, $dst|$dst, $src}", + [(store (sra (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>; +def SAR16mi : Ii8<0xC1, MRM7m, (outs), (ins i16mem:$dst, i8imm:$src), + "sar{w}\t{$src, $dst|$dst, $src}", + [(store (sra (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>, + OpSize; +def SAR32mi : Ii8<0xC1, MRM7m, (outs), (ins i32mem:$dst, i8imm:$src), + "sar{l}\t{$src, $dst|$dst, $src}", + [(store (sra (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>; +def SAR64mi : RIi8<0xC1, MRM7m, (outs), (ins i64mem:$dst, i8imm:$src), + "sar{q}\t{$src, $dst|$dst, $src}", + [(store (sra (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>; + +// Shift by 1 +def SAR8m1 : I<0xD0, MRM7m, (outs), (ins i8mem :$dst), + "sar{b}\t$dst", + [(store (sra (loadi8 addr:$dst), (i8 1)), addr:$dst)]>; +def SAR16m1 : I<0xD1, MRM7m, (outs), (ins i16mem:$dst), + "sar{w}\t$dst", + [(store (sra (loadi16 addr:$dst), (i8 1)), addr:$dst)]>, + OpSize; +def SAR32m1 : I<0xD1, MRM7m, (outs), (ins i32mem:$dst), + "sar{l}\t$dst", + [(store (sra (loadi32 addr:$dst), (i8 1)), addr:$dst)]>; +def SAR64m1 : RI<0xD1, MRM7m, (outs), (ins i64mem:$dst), + "sar{q}\t$dst", + [(store (sra (loadi64 addr:$dst), (i8 1)), addr:$dst)]>; + +//===----------------------------------------------------------------------===// +// Rotate instructions +//===----------------------------------------------------------------------===// + +let Constraints = "$src1 = $dst" in { +def RCL8r1 : I<0xD0, MRM2r, (outs GR8:$dst), (ins GR8:$src1), + "rcl{b}\t$dst", []>; +def RCL8ri : Ii8<0xC0, MRM2r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$cnt), + "rcl{b}\t{$cnt, $dst|$dst, $cnt}", []>; +let Uses = [CL] in +def RCL8rCL : I<0xD2, MRM2r, (outs GR8:$dst), (ins GR8:$src1), + "rcl{b}\t{%cl, $dst|$dst, CL}", []>; + +def RCL16r1 : I<0xD1, MRM2r, (outs GR16:$dst), (ins GR16:$src1), + "rcl{w}\t$dst", []>, OpSize; +def RCL16ri : Ii8<0xC1, MRM2r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$cnt), + "rcl{w}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize; +let Uses = [CL] in +def RCL16rCL : I<0xD3, MRM2r, (outs GR16:$dst), (ins GR16:$src1), + "rcl{w}\t{%cl, $dst|$dst, CL}", []>, OpSize; + +def RCL32r1 : I<0xD1, MRM2r, (outs GR32:$dst), (ins GR32:$src1), + "rcl{l}\t$dst", []>; +def RCL32ri : Ii8<0xC1, MRM2r, (outs GR32:$dst), (ins GR32:$src1, 
i8imm:$cnt), + "rcl{l}\t{$cnt, $dst|$dst, $cnt}", []>; +let Uses = [CL] in +def RCL32rCL : I<0xD3, MRM2r, (outs GR32:$dst), (ins GR32:$src1), + "rcl{l}\t{%cl, $dst|$dst, CL}", []>; + + +def RCL64r1 : RI<0xD1, MRM2r, (outs GR64:$dst), (ins GR64:$src1), + "rcl{q}\t$dst", []>; +def RCL64ri : RIi8<0xC1, MRM2r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$cnt), + "rcl{q}\t{$cnt, $dst|$dst, $cnt}", []>; +let Uses = [CL] in +def RCL64rCL : RI<0xD3, MRM2r, (outs GR64:$dst), (ins GR64:$src1), + "rcl{q}\t{%cl, $dst|$dst, CL}", []>; + + +def RCR8r1 : I<0xD0, MRM3r, (outs GR8:$dst), (ins GR8:$src1), + "rcr{b}\t$dst", []>; +def RCR8ri : Ii8<0xC0, MRM3r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$cnt), + "rcr{b}\t{$cnt, $dst|$dst, $cnt}", []>; +let Uses = [CL] in +def RCR8rCL : I<0xD2, MRM3r, (outs GR8:$dst), (ins GR8:$src1), + "rcr{b}\t{%cl, $dst|$dst, CL}", []>; + +def RCR16r1 : I<0xD1, MRM3r, (outs GR16:$dst), (ins GR16:$src1), + "rcr{w}\t$dst", []>, OpSize; +def RCR16ri : Ii8<0xC1, MRM3r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$cnt), + "rcr{w}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize; +let Uses = [CL] in +def RCR16rCL : I<0xD3, MRM3r, (outs GR16:$dst), (ins GR16:$src1), + "rcr{w}\t{%cl, $dst|$dst, CL}", []>, OpSize; + +def RCR32r1 : I<0xD1, MRM3r, (outs GR32:$dst), (ins GR32:$src1), + "rcr{l}\t$dst", []>; +def RCR32ri : Ii8<0xC1, MRM3r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$cnt), + "rcr{l}\t{$cnt, $dst|$dst, $cnt}", []>; +let Uses = [CL] in +def RCR32rCL : I<0xD3, MRM3r, (outs GR32:$dst), (ins GR32:$src1), + "rcr{l}\t{%cl, $dst|$dst, CL}", []>; + +def RCR64r1 : RI<0xD1, MRM3r, (outs GR64:$dst), (ins GR64:$src1), + "rcr{q}\t$dst", []>; +def RCR64ri : RIi8<0xC1, MRM3r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$cnt), + "rcr{q}\t{$cnt, $dst|$dst, $cnt}", []>; +let Uses = [CL] in +def RCR64rCL : RI<0xD3, MRM3r, (outs GR64:$dst), (ins GR64:$src1), + "rcr{q}\t{%cl, $dst|$dst, CL}", []>; + +} // Constraints = "$src = $dst" + +def RCL8m1 : I<0xD0, MRM2m, (outs), (ins i8mem:$dst), + "rcl{b}\t$dst", []>; +def RCL8mi : Ii8<0xC0, MRM2m, (outs), (ins i8mem:$dst, i8imm:$cnt), + "rcl{b}\t{$cnt, $dst|$dst, $cnt}", []>; +def RCL16m1 : I<0xD1, MRM2m, (outs), (ins i16mem:$dst), + "rcl{w}\t$dst", []>, OpSize; +def RCL16mi : Ii8<0xC1, MRM2m, (outs), (ins i16mem:$dst, i8imm:$cnt), + "rcl{w}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize; +def RCL32m1 : I<0xD1, MRM2m, (outs), (ins i32mem:$dst), + "rcl{l}\t$dst", []>; +def RCL32mi : Ii8<0xC1, MRM2m, (outs), (ins i32mem:$dst, i8imm:$cnt), + "rcl{l}\t{$cnt, $dst|$dst, $cnt}", []>; +def RCL64m1 : RI<0xD1, MRM2m, (outs), (ins i64mem:$dst), + "rcl{q}\t$dst", []>; +def RCL64mi : RIi8<0xC1, MRM2m, (outs), (ins i64mem:$dst, i8imm:$cnt), + "rcl{q}\t{$cnt, $dst|$dst, $cnt}", []>; + +def RCR8m1 : I<0xD0, MRM3m, (outs), (ins i8mem:$dst), + "rcr{b}\t$dst", []>; +def RCR8mi : Ii8<0xC0, MRM3m, (outs), (ins i8mem:$dst, i8imm:$cnt), + "rcr{b}\t{$cnt, $dst|$dst, $cnt}", []>; +def RCR16m1 : I<0xD1, MRM3m, (outs), (ins i16mem:$dst), + "rcr{w}\t$dst", []>, OpSize; +def RCR16mi : Ii8<0xC1, MRM3m, (outs), (ins i16mem:$dst, i8imm:$cnt), + "rcr{w}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize; +def RCR32m1 : I<0xD1, MRM3m, (outs), (ins i32mem:$dst), + "rcr{l}\t$dst", []>; +def RCR32mi : Ii8<0xC1, MRM3m, (outs), (ins i32mem:$dst, i8imm:$cnt), + "rcr{l}\t{$cnt, $dst|$dst, $cnt}", []>; +def RCR64m1 : RI<0xD1, MRM3m, (outs), (ins i64mem:$dst), + "rcr{q}\t$dst", []>; +def RCR64mi : RIi8<0xC1, MRM3m, (outs), (ins i64mem:$dst, i8imm:$cnt), + "rcr{q}\t{$cnt, $dst|$dst, $cnt}", []>; + +let Uses = [CL] in { +def RCL8mCL : 
I<0xD2, MRM2m, (outs), (ins i8mem:$dst), + "rcl{b}\t{%cl, $dst|$dst, CL}", []>; +def RCL16mCL : I<0xD3, MRM2m, (outs), (ins i16mem:$dst), + "rcl{w}\t{%cl, $dst|$dst, CL}", []>, OpSize; +def RCL32mCL : I<0xD3, MRM2m, (outs), (ins i32mem:$dst), + "rcl{l}\t{%cl, $dst|$dst, CL}", []>; +def RCL64mCL : RI<0xD3, MRM2m, (outs), (ins i64mem:$dst), + "rcl{q}\t{%cl, $dst|$dst, CL}", []>; + +def RCR8mCL : I<0xD2, MRM3m, (outs), (ins i8mem:$dst), + "rcr{b}\t{%cl, $dst|$dst, CL}", []>; +def RCR16mCL : I<0xD3, MRM3m, (outs), (ins i16mem:$dst), + "rcr{w}\t{%cl, $dst|$dst, CL}", []>, OpSize; +def RCR32mCL : I<0xD3, MRM3m, (outs), (ins i32mem:$dst), + "rcr{l}\t{%cl, $dst|$dst, CL}", []>; +def RCR64mCL : RI<0xD3, MRM3m, (outs), (ins i64mem:$dst), + "rcr{q}\t{%cl, $dst|$dst, CL}", []>; +} + +let Constraints = "$src1 = $dst" in { +// FIXME: provide shorter instructions when imm8 == 1 +let Uses = [CL] in { +def ROL8rCL : I<0xD2, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1), + "rol{b}\t{%cl, $dst|$dst, CL}", + [(set GR8:$dst, (rotl GR8:$src1, CL))]>; +def ROL16rCL : I<0xD3, MRM0r, (outs GR16:$dst), (ins GR16:$src1), + "rol{w}\t{%cl, $dst|$dst, CL}", + [(set GR16:$dst, (rotl GR16:$src1, CL))]>, OpSize; +def ROL32rCL : I<0xD3, MRM0r, (outs GR32:$dst), (ins GR32:$src1), + "rol{l}\t{%cl, $dst|$dst, CL}", + [(set GR32:$dst, (rotl GR32:$src1, CL))]>; +def ROL64rCL : RI<0xD3, MRM0r, (outs GR64:$dst), (ins GR64:$src1), + "rol{q}\t{%cl, $dst|$dst, %CL}", + [(set GR64:$dst, (rotl GR64:$src1, CL))]>; +} + +def ROL8ri : Ii8<0xC0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2), + "rol{b}\t{$src2, $dst|$dst, $src2}", + [(set GR8:$dst, (rotl GR8:$src1, (i8 imm:$src2)))]>; +def ROL16ri : Ii8<0xC1, MRM0r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2), + "rol{w}\t{$src2, $dst|$dst, $src2}", + [(set GR16:$dst, (rotl GR16:$src1, (i8 imm:$src2)))]>, + OpSize; +def ROL32ri : Ii8<0xC1, MRM0r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2), + "rol{l}\t{$src2, $dst|$dst, $src2}", + [(set GR32:$dst, (rotl GR32:$src1, (i8 imm:$src2)))]>; +def ROL64ri : RIi8<0xC1, MRM0r, (outs GR64:$dst), + (ins GR64:$src1, i8imm:$src2), + "rol{q}\t{$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (rotl GR64:$src1, (i8 imm:$src2)))]>; + +// Rotate by 1 +def ROL8r1 : I<0xD0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1), + "rol{b}\t$dst", + [(set GR8:$dst, (rotl GR8:$src1, (i8 1)))]>; +def ROL16r1 : I<0xD1, MRM0r, (outs GR16:$dst), (ins GR16:$src1), + "rol{w}\t$dst", + [(set GR16:$dst, (rotl GR16:$src1, (i8 1)))]>, OpSize; +def ROL32r1 : I<0xD1, MRM0r, (outs GR32:$dst), (ins GR32:$src1), + "rol{l}\t$dst", + [(set GR32:$dst, (rotl GR32:$src1, (i8 1)))]>; +def ROL64r1 : RI<0xD1, MRM0r, (outs GR64:$dst), (ins GR64:$src1), + "rol{q}\t$dst", + [(set GR64:$dst, (rotl GR64:$src1, (i8 1)))]>; +} // Constraints = "$src = $dst" + +let Uses = [CL] in { +def ROL8mCL : I<0xD2, MRM0m, (outs), (ins i8mem :$dst), + "rol{b}\t{%cl, $dst|$dst, CL}", + [(store (rotl (loadi8 addr:$dst), CL), addr:$dst)]>; +def ROL16mCL : I<0xD3, MRM0m, (outs), (ins i16mem:$dst), + "rol{w}\t{%cl, $dst|$dst, CL}", + [(store (rotl (loadi16 addr:$dst), CL), addr:$dst)]>, OpSize; +def ROL32mCL : I<0xD3, MRM0m, (outs), (ins i32mem:$dst), + "rol{l}\t{%cl, $dst|$dst, CL}", + [(store (rotl (loadi32 addr:$dst), CL), addr:$dst)]>; +def ROL64mCL : RI<0xD3, MRM0m, (outs), (ins i64mem:$dst), + "rol{q}\t{%cl, $dst|$dst, %CL}", + [(store (rotl (loadi64 addr:$dst), CL), addr:$dst)]>; +} +def ROL8mi : Ii8<0xC0, MRM0m, (outs), (ins i8mem :$dst, i8imm:$src1), + "rol{b}\t{$src1, $dst|$dst, $src1}", + [(store (rotl 
(loadi8 addr:$dst), (i8 imm:$src1)), addr:$dst)]>; +def ROL16mi : Ii8<0xC1, MRM0m, (outs), (ins i16mem:$dst, i8imm:$src1), + "rol{w}\t{$src1, $dst|$dst, $src1}", + [(store (rotl (loadi16 addr:$dst), (i8 imm:$src1)), addr:$dst)]>, + OpSize; +def ROL32mi : Ii8<0xC1, MRM0m, (outs), (ins i32mem:$dst, i8imm:$src1), + "rol{l}\t{$src1, $dst|$dst, $src1}", + [(store (rotl (loadi32 addr:$dst), (i8 imm:$src1)), addr:$dst)]>; +def ROL64mi : RIi8<0xC1, MRM0m, (outs), (ins i64mem:$dst, i8imm:$src1), + "rol{q}\t{$src1, $dst|$dst, $src1}", + [(store (rotl (loadi64 addr:$dst), (i8 imm:$src1)), addr:$dst)]>; + +// Rotate by 1 +def ROL8m1 : I<0xD0, MRM0m, (outs), (ins i8mem :$dst), + "rol{b}\t$dst", + [(store (rotl (loadi8 addr:$dst), (i8 1)), addr:$dst)]>; +def ROL16m1 : I<0xD1, MRM0m, (outs), (ins i16mem:$dst), + "rol{w}\t$dst", + [(store (rotl (loadi16 addr:$dst), (i8 1)), addr:$dst)]>, + OpSize; +def ROL32m1 : I<0xD1, MRM0m, (outs), (ins i32mem:$dst), + "rol{l}\t$dst", + [(store (rotl (loadi32 addr:$dst), (i8 1)), addr:$dst)]>; +def ROL64m1 : RI<0xD1, MRM0m, (outs), (ins i64mem:$dst), + "rol{q}\t$dst", + [(store (rotl (loadi64 addr:$dst), (i8 1)), addr:$dst)]>; + +let Constraints = "$src1 = $dst" in { +let Uses = [CL] in { +def ROR8rCL : I<0xD2, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1), + "ror{b}\t{%cl, $dst|$dst, CL}", + [(set GR8:$dst, (rotr GR8:$src1, CL))]>; +def ROR16rCL : I<0xD3, MRM1r, (outs GR16:$dst), (ins GR16:$src1), + "ror{w}\t{%cl, $dst|$dst, CL}", + [(set GR16:$dst, (rotr GR16:$src1, CL))]>, OpSize; +def ROR32rCL : I<0xD3, MRM1r, (outs GR32:$dst), (ins GR32:$src1), + "ror{l}\t{%cl, $dst|$dst, CL}", + [(set GR32:$dst, (rotr GR32:$src1, CL))]>; +def ROR64rCL : RI<0xD3, MRM1r, (outs GR64:$dst), (ins GR64:$src1), + "ror{q}\t{%cl, $dst|$dst, %CL}", + [(set GR64:$dst, (rotr GR64:$src1, CL))]>; +} + +def ROR8ri : Ii8<0xC0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2), + "ror{b}\t{$src2, $dst|$dst, $src2}", + [(set GR8:$dst, (rotr GR8:$src1, (i8 imm:$src2)))]>; +def ROR16ri : Ii8<0xC1, MRM1r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2), + "ror{w}\t{$src2, $dst|$dst, $src2}", + [(set GR16:$dst, (rotr GR16:$src1, (i8 imm:$src2)))]>, + OpSize; +def ROR32ri : Ii8<0xC1, MRM1r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2), + "ror{l}\t{$src2, $dst|$dst, $src2}", + [(set GR32:$dst, (rotr GR32:$src1, (i8 imm:$src2)))]>; +def ROR64ri : RIi8<0xC1, MRM1r, (outs GR64:$dst), + (ins GR64:$src1, i8imm:$src2), + "ror{q}\t{$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (rotr GR64:$src1, (i8 imm:$src2)))]>; + +// Rotate by 1 +def ROR8r1 : I<0xD0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1), + "ror{b}\t$dst", + [(set GR8:$dst, (rotr GR8:$src1, (i8 1)))]>; +def ROR16r1 : I<0xD1, MRM1r, (outs GR16:$dst), (ins GR16:$src1), + "ror{w}\t$dst", + [(set GR16:$dst, (rotr GR16:$src1, (i8 1)))]>, OpSize; +def ROR32r1 : I<0xD1, MRM1r, (outs GR32:$dst), (ins GR32:$src1), + "ror{l}\t$dst", + [(set GR32:$dst, (rotr GR32:$src1, (i8 1)))]>; +def ROR64r1 : RI<0xD1, MRM1r, (outs GR64:$dst), (ins GR64:$src1), + "ror{q}\t$dst", + [(set GR64:$dst, (rotr GR64:$src1, (i8 1)))]>; +} // Constraints = "$src = $dst" + +let Uses = [CL] in { +def ROR8mCL : I<0xD2, MRM1m, (outs), (ins i8mem :$dst), + "ror{b}\t{%cl, $dst|$dst, CL}", + [(store (rotr (loadi8 addr:$dst), CL), addr:$dst)]>; +def ROR16mCL : I<0xD3, MRM1m, (outs), (ins i16mem:$dst), + "ror{w}\t{%cl, $dst|$dst, CL}", + [(store (rotr (loadi16 addr:$dst), CL), addr:$dst)]>, OpSize; +def ROR32mCL : I<0xD3, MRM1m, (outs), (ins i32mem:$dst), + "ror{l}\t{%cl, $dst|$dst, CL}", + 
[(store (rotr (loadi32 addr:$dst), CL), addr:$dst)]>; +def ROR64mCL : RI<0xD3, MRM1m, (outs), (ins i64mem:$dst), + "ror{q}\t{%cl, $dst|$dst, %CL}", + [(store (rotr (loadi64 addr:$dst), CL), addr:$dst)]>; +} +def ROR8mi : Ii8<0xC0, MRM1m, (outs), (ins i8mem :$dst, i8imm:$src), + "ror{b}\t{$src, $dst|$dst, $src}", + [(store (rotr (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>; +def ROR16mi : Ii8<0xC1, MRM1m, (outs), (ins i16mem:$dst, i8imm:$src), + "ror{w}\t{$src, $dst|$dst, $src}", + [(store (rotr (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>, + OpSize; +def ROR32mi : Ii8<0xC1, MRM1m, (outs), (ins i32mem:$dst, i8imm:$src), + "ror{l}\t{$src, $dst|$dst, $src}", + [(store (rotr (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>; +def ROR64mi : RIi8<0xC1, MRM1m, (outs), (ins i64mem:$dst, i8imm:$src), + "ror{q}\t{$src, $dst|$dst, $src}", + [(store (rotr (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>; + +// Rotate by 1 +def ROR8m1 : I<0xD0, MRM1m, (outs), (ins i8mem :$dst), + "ror{b}\t$dst", + [(store (rotr (loadi8 addr:$dst), (i8 1)), addr:$dst)]>; +def ROR16m1 : I<0xD1, MRM1m, (outs), (ins i16mem:$dst), + "ror{w}\t$dst", + [(store (rotr (loadi16 addr:$dst), (i8 1)), addr:$dst)]>, + OpSize; +def ROR32m1 : I<0xD1, MRM1m, (outs), (ins i32mem:$dst), + "ror{l}\t$dst", + [(store (rotr (loadi32 addr:$dst), (i8 1)), addr:$dst)]>; +def ROR64m1 : RI<0xD1, MRM1m, (outs), (ins i64mem:$dst), + "ror{q}\t$dst", + [(store (rotr (loadi64 addr:$dst), (i8 1)), addr:$dst)]>; + + +//===----------------------------------------------------------------------===// +// Double shift instructions (generalizations of rotate) +//===----------------------------------------------------------------------===// + +let Constraints = "$src1 = $dst" in { + +let Uses = [CL] in { +def SHLD16rrCL : I<0xA5, MRMDestReg, (outs GR16:$dst), + (ins GR16:$src1, GR16:$src2), + "shld{w}\t{%cl, $src2, $dst|$dst, $src2, CL}", + [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, CL))]>, + TB, OpSize; +def SHRD16rrCL : I<0xAD, MRMDestReg, (outs GR16:$dst), + (ins GR16:$src1, GR16:$src2), + "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, CL}", + [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, CL))]>, + TB, OpSize; +def SHLD32rrCL : I<0xA5, MRMDestReg, (outs GR32:$dst), + (ins GR32:$src1, GR32:$src2), + "shld{l}\t{%cl, $src2, $dst|$dst, $src2, CL}", + [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, CL))]>, TB; +def SHRD32rrCL : I<0xAD, MRMDestReg, (outs GR32:$dst), + (ins GR32:$src1, GR32:$src2), + "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, CL}", + [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, CL))]>, TB; +def SHLD64rrCL : RI<0xA5, MRMDestReg, (outs GR64:$dst), + (ins GR64:$src1, GR64:$src2), + "shld{q}\t{%cl, $src2, $dst|$dst, $src2, %CL}", + [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2, CL))]>, + TB; +def SHRD64rrCL : RI<0xAD, MRMDestReg, (outs GR64:$dst), + (ins GR64:$src1, GR64:$src2), + "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, %CL}", + [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, CL))]>, + TB; +} + +let isCommutable = 1 in { // These instructions commute to each other. 
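+// For reference, the double-shift semantics modeled by X86shld/X86shrd:
+// e.g. "shld $3, %bx, %ax" shifts AX left by three and fills the vacated
+// low bits from the top three bits of BX, while shrd shifts right and
+// fills the vacated high bits from the low bits of the source.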
+def SHLD16rri8 : Ii8<0xA4, MRMDestReg, + (outs GR16:$dst), + (ins GR16:$src1, GR16:$src2, i8imm:$src3), + "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, + (i8 imm:$src3)))]>, + TB, OpSize; +def SHRD16rri8 : Ii8<0xAC, MRMDestReg, + (outs GR16:$dst), + (ins GR16:$src1, GR16:$src2, i8imm:$src3), + "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, + (i8 imm:$src3)))]>, + TB, OpSize; +def SHLD32rri8 : Ii8<0xA4, MRMDestReg, + (outs GR32:$dst), + (ins GR32:$src1, GR32:$src2, i8imm:$src3), + "shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, + (i8 imm:$src3)))]>, + TB; +def SHRD32rri8 : Ii8<0xAC, MRMDestReg, + (outs GR32:$dst), + (ins GR32:$src1, GR32:$src2, i8imm:$src3), + "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, + (i8 imm:$src3)))]>, + TB; +def SHLD64rri8 : RIi8<0xA4, MRMDestReg, + (outs GR64:$dst), + (ins GR64:$src1, GR64:$src2, i8imm:$src3), + "shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2, + (i8 imm:$src3)))]>, + TB; +def SHRD64rri8 : RIi8<0xAC, MRMDestReg, + (outs GR64:$dst), + (ins GR64:$src1, GR64:$src2, i8imm:$src3), + "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, + (i8 imm:$src3)))]>, + TB; +} +} // Constraints = "$src = $dst" + +let Uses = [CL] in { +def SHLD16mrCL : I<0xA5, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2), + "shld{w}\t{%cl, $src2, $dst|$dst, $src2, CL}", + [(store (X86shld (loadi16 addr:$dst), GR16:$src2, CL), + addr:$dst)]>, TB, OpSize; +def SHRD16mrCL : I<0xAD, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2), + "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, CL}", + [(store (X86shrd (loadi16 addr:$dst), GR16:$src2, CL), + addr:$dst)]>, TB, OpSize; + +def SHLD32mrCL : I<0xA5, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2), + "shld{l}\t{%cl, $src2, $dst|$dst, $src2, CL}", + [(store (X86shld (loadi32 addr:$dst), GR32:$src2, CL), + addr:$dst)]>, TB; +def SHRD32mrCL : I<0xAD, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2), + "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, CL}", + [(store (X86shrd (loadi32 addr:$dst), GR32:$src2, CL), + addr:$dst)]>, TB; + +def SHLD64mrCL : RI<0xA5, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), + "shld{q}\t{%cl, $src2, $dst|$dst, $src2, %CL}", + [(store (X86shld (loadi64 addr:$dst), GR64:$src2, CL), + addr:$dst)]>, TB; +def SHRD64mrCL : RI<0xAD, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), + "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, %CL}", + [(store (X86shrd (loadi64 addr:$dst), GR64:$src2, CL), + addr:$dst)]>, TB; +} + +def SHLD16mri8 : Ii8<0xA4, MRMDestMem, + (outs), (ins i16mem:$dst, GR16:$src2, i8imm:$src3), + "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(store (X86shld (loadi16 addr:$dst), GR16:$src2, + (i8 imm:$src3)), addr:$dst)]>, + TB, OpSize; +def SHRD16mri8 : Ii8<0xAC, MRMDestMem, + (outs), (ins i16mem:$dst, GR16:$src2, i8imm:$src3), + "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(store (X86shrd (loadi16 addr:$dst), GR16:$src2, + (i8 imm:$src3)), addr:$dst)]>, + TB, OpSize; + +def SHLD32mri8 : Ii8<0xA4, MRMDestMem, + (outs), (ins i32mem:$dst, GR32:$src2, i8imm:$src3), + "shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(store (X86shld (loadi32 addr:$dst), GR32:$src2, + (i8 imm:$src3)), addr:$dst)]>, + TB; +def SHRD32mri8 : Ii8<0xAC, MRMDestMem, + (outs), (ins 
i32mem:$dst, GR32:$src2, i8imm:$src3), + "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(store (X86shrd (loadi32 addr:$dst), GR32:$src2, + (i8 imm:$src3)), addr:$dst)]>, + TB; + +def SHLD64mri8 : RIi8<0xA4, MRMDestMem, + (outs), (ins i64mem:$dst, GR64:$src2, i8imm:$src3), + "shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(store (X86shld (loadi64 addr:$dst), GR64:$src2, + (i8 imm:$src3)), addr:$dst)]>, + TB; +def SHRD64mri8 : RIi8<0xAC, MRMDestMem, + (outs), (ins i64mem:$dst, GR64:$src2, i8imm:$src3), + "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(store (X86shrd (loadi64 addr:$dst), GR64:$src2, + (i8 imm:$src3)), addr:$dst)]>, + TB; + +} // Defs = [EFLAGS] + diff --git a/contrib/llvm/lib/Target/X86/X86InstrSystem.td b/contrib/llvm/lib/Target/X86/X86InstrSystem.td new file mode 100644 index 0000000..05a5b36 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrSystem.td @@ -0,0 +1,467 @@ +//===- X86InstrSystem.td - System Instructions -------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the X86 instructions that are generally used in +// privileged modes. These are not typically used by the compiler, but are +// supported for the assembler and disassembler. +// +//===----------------------------------------------------------------------===// + +let Defs = [RAX, RDX] in + def RDTSC : I<0x31, RawFrm, (outs), (ins), "rdtsc", [(X86rdtsc)]>, TB; + +let Defs = [RAX, RCX, RDX] in + def RDTSCP : I<0x01, MRM_F9, (outs), (ins), "rdtscp", []>, TB; + +// CPU flow control instructions + +let isTerminator = 1, isBarrier = 1, hasCtrlDep = 1 in { + def TRAP : I<0x0B, RawFrm, (outs), (ins), "ud2", [(trap)]>, TB; + def UD2B : I<0xB9, RawFrm, (outs), (ins), "ud2b", []>, TB; +} + +def HLT : I<0xF4, RawFrm, (outs), (ins), "hlt", []>; +def RSM : I<0xAA, RawFrm, (outs), (ins), "rsm", []>, TB; + +// Interrupt and SysCall Instructions. +let Uses = [EFLAGS] in + def INTO : I<0xce, RawFrm, (outs), (ins), "into", []>; +def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3", + [(int_x86_int (i8 3))]>; + +// The long form of "int $3" turns into int3 as a size optimization. +// FIXME: This doesn't work because InstAlias can't match immediate constants. +//def : InstAlias<"int\t$3", (INT3)>; + + +def INT : Ii8<0xcd, RawFrm, (outs), (ins i8imm:$trap), "int\t$trap", + [(int_x86_int imm:$trap)]>; + + +def SYSCALL : I<0x05, RawFrm, (outs), (ins), "syscall", []>, TB; +def SYSRETL : I<0x07, RawFrm, (outs), (ins), "sysretl", []>, TB; +def SYSRETQ :RI<0x07, RawFrm, (outs), (ins), "sysretq", []>, TB, + Requires<[In64BitMode]>; + +def SYSENTER : I<0x34, RawFrm, (outs), (ins), "sysenter", []>, TB; + +def SYSEXIT : I<0x35, RawFrm, (outs), (ins), "sysexit", []>, TB, + Requires<[In32BitMode]>; +def SYSEXIT64 :RI<0x35, RawFrm, (outs), (ins), "sysexit", []>, TB, + Requires<[In64BitMode]>; + +def IRET16 : I<0xcf, RawFrm, (outs), (ins), "iretw", []>, OpSize; +def IRET32 : I<0xcf, RawFrm, (outs), (ins), "iret{l|d}", []>; +def IRET64 : RI<0xcf, RawFrm, (outs), (ins), "iretq", []>, + Requires<[In64BitMode]>; + + +//===----------------------------------------------------------------------===// +// Input/Output Instructions. 
+// +let Defs = [AL], Uses = [DX] in +def IN8rr : I<0xEC, RawFrm, (outs), (ins), + "in{b}\t{%dx, %al|AL, DX}", []>; +let Defs = [AX], Uses = [DX] in +def IN16rr : I<0xED, RawFrm, (outs), (ins), + "in{w}\t{%dx, %ax|AX, DX}", []>, OpSize; +let Defs = [EAX], Uses = [DX] in +def IN32rr : I<0xED, RawFrm, (outs), (ins), + "in{l}\t{%dx, %eax|EAX, DX}", []>; + +let Defs = [AL] in +def IN8ri : Ii8<0xE4, RawFrm, (outs), (ins i8imm:$port), + "in{b}\t{$port, %al|AL, $port}", []>; +let Defs = [AX] in +def IN16ri : Ii8<0xE5, RawFrm, (outs), (ins i8imm:$port), + "in{w}\t{$port, %ax|AX, $port}", []>, OpSize; +let Defs = [EAX] in +def IN32ri : Ii8<0xE5, RawFrm, (outs), (ins i8imm:$port), + "in{l}\t{$port, %eax|EAX, $port}", []>; + +let Uses = [DX, AL] in +def OUT8rr : I<0xEE, RawFrm, (outs), (ins), + "out{b}\t{%al, %dx|DX, AL}", []>; +let Uses = [DX, AX] in +def OUT16rr : I<0xEF, RawFrm, (outs), (ins), + "out{w}\t{%ax, %dx|DX, AX}", []>, OpSize; +let Uses = [DX, EAX] in +def OUT32rr : I<0xEF, RawFrm, (outs), (ins), + "out{l}\t{%eax, %dx|DX, EAX}", []>; + +let Uses = [AL] in +def OUT8ir : Ii8<0xE6, RawFrm, (outs), (ins i8imm:$port), + "out{b}\t{%al, $port|$port, AL}", []>; +let Uses = [AX] in +def OUT16ir : Ii8<0xE7, RawFrm, (outs), (ins i8imm:$port), + "out{w}\t{%ax, $port|$port, AX}", []>, OpSize; +let Uses = [EAX] in +def OUT32ir : Ii8<0xE7, RawFrm, (outs), (ins i8imm:$port), + "out{l}\t{%eax, $port|$port, EAX}", []>; + +def IN8 : I<0x6C, RawFrm, (outs), (ins), "ins{b}", []>; +def IN16 : I<0x6D, RawFrm, (outs), (ins), "ins{w}", []>, OpSize; +def IN32 : I<0x6D, RawFrm, (outs), (ins), "ins{l}", []>; + +//===----------------------------------------------------------------------===// +// Moves to and from debug registers + +def MOV32rd : I<0x21, MRMDestReg, (outs GR32:$dst), (ins DEBUG_REG:$src), + "mov{l}\t{$src, $dst|$dst, $src}", []>, TB; +def MOV64rd : I<0x21, MRMDestReg, (outs GR64:$dst), (ins DEBUG_REG:$src), + "mov{q}\t{$src, $dst|$dst, $src}", []>, TB; + +def MOV32dr : I<0x23, MRMSrcReg, (outs DEBUG_REG:$dst), (ins GR32:$src), + "mov{l}\t{$src, $dst|$dst, $src}", []>, TB; +def MOV64dr : I<0x23, MRMSrcReg, (outs DEBUG_REG:$dst), (ins GR64:$src), + "mov{q}\t{$src, $dst|$dst, $src}", []>, TB; + +//===----------------------------------------------------------------------===// +// Moves to and from control registers + +def MOV32rc : I<0x20, MRMDestReg, (outs GR32:$dst), (ins CONTROL_REG:$src), + "mov{l}\t{$src, $dst|$dst, $src}", []>, TB; +def MOV64rc : I<0x20, MRMDestReg, (outs GR64:$dst), (ins CONTROL_REG:$src), + "mov{q}\t{$src, $dst|$dst, $src}", []>, TB; + +def MOV32cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR32:$src), + "mov{l}\t{$src, $dst|$dst, $src}", []>, TB; +def MOV64cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR64:$src), + "mov{q}\t{$src, $dst|$dst, $src}", []>, TB; + +//===----------------------------------------------------------------------===// +// Segment override instruction prefixes + +def CS_PREFIX : I<0x2E, RawFrm, (outs), (ins), "cs", []>; +def SS_PREFIX : I<0x36, RawFrm, (outs), (ins), "ss", []>; +def DS_PREFIX : I<0x3E, RawFrm, (outs), (ins), "ds", []>; +def ES_PREFIX : I<0x26, RawFrm, (outs), (ins), "es", []>; +def FS_PREFIX : I<0x64, RawFrm, (outs), (ins), "fs", []>; +def GS_PREFIX : I<0x65, RawFrm, (outs), (ins), "gs", []>; + + +//===----------------------------------------------------------------------===// +// Moves to and from segment registers. 
+// + +def MOV16rs : I<0x8C, MRMDestReg, (outs GR16:$dst), (ins SEGMENT_REG:$src), + "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize; +def MOV32rs : I<0x8C, MRMDestReg, (outs GR32:$dst), (ins SEGMENT_REG:$src), + "mov{l}\t{$src, $dst|$dst, $src}", []>; +def MOV64rs : RI<0x8C, MRMDestReg, (outs GR64:$dst), (ins SEGMENT_REG:$src), + "mov{q}\t{$src, $dst|$dst, $src}", []>; + +def MOV16ms : I<0x8C, MRMDestMem, (outs i16mem:$dst), (ins SEGMENT_REG:$src), + "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize; +def MOV32ms : I<0x8C, MRMDestMem, (outs i32mem:$dst), (ins SEGMENT_REG:$src), + "mov{l}\t{$src, $dst|$dst, $src}", []>; +def MOV64ms : RI<0x8C, MRMDestMem, (outs i64mem:$dst), (ins SEGMENT_REG:$src), + "mov{q}\t{$src, $dst|$dst, $src}", []>; + +def MOV16sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR16:$src), + "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize; +def MOV32sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR32:$src), + "mov{l}\t{$src, $dst|$dst, $src}", []>; +def MOV64sr : RI<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR64:$src), + "mov{q}\t{$src, $dst|$dst, $src}", []>; + +def MOV16sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i16mem:$src), + "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize; +def MOV32sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i32mem:$src), + "mov{l}\t{$src, $dst|$dst, $src}", []>; +def MOV64sm : RI<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i64mem:$src), + "mov{q}\t{$src, $dst|$dst, $src}", []>; + +//===----------------------------------------------------------------------===// +// Segmentation support instructions. + +def SWAPGS : I<0x01, MRM_F8, (outs), (ins), "swapgs", []>, TB; + +def LAR16rm : I<0x02, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "lar{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; +def LAR16rr : I<0x02, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), + "lar{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; + +// i16mem operand in LAR32rm and GR32 operand in LAR32rr is not a typo. +def LAR32rm : I<0x02, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src), + "lar{l}\t{$src, $dst|$dst, $src}", []>, TB; +def LAR32rr : I<0x02, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "lar{l}\t{$src, $dst|$dst, $src}", []>, TB; +// i16mem operand in LAR64rm and GR32 operand in LAR32rr is not a typo. 
+def LAR64rm : RI<0x02, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src), + "lar{q}\t{$src, $dst|$dst, $src}", []>, TB; +def LAR64rr : RI<0x02, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src), + "lar{q}\t{$src, $dst|$dst, $src}", []>, TB; + +def LSL16rm : I<0x03, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "lsl{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; +def LSL16rr : I<0x03, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), + "lsl{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; +def LSL32rm : I<0x03, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "lsl{l}\t{$src, $dst|$dst, $src}", []>, TB; +def LSL32rr : I<0x03, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "lsl{l}\t{$src, $dst|$dst, $src}", []>, TB; +def LSL64rm : RI<0x03, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "lsl{q}\t{$src, $dst|$dst, $src}", []>, TB; +def LSL64rr : RI<0x03, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), + "lsl{q}\t{$src, $dst|$dst, $src}", []>, TB; + +def INVLPG : I<0x01, MRM7m, (outs), (ins i8mem:$addr), "invlpg\t$addr", []>, TB; + +def STR16r : I<0x00, MRM1r, (outs GR16:$dst), (ins), + "str{w}\t{$dst}", []>, TB, OpSize; +def STR32r : I<0x00, MRM1r, (outs GR32:$dst), (ins), + "str{l}\t{$dst}", []>, TB; +def STR64r : RI<0x00, MRM1r, (outs GR64:$dst), (ins), + "str{q}\t{$dst}", []>, TB; +def STRm : I<0x00, MRM1m, (outs i16mem:$dst), (ins), + "str{w}\t{$dst}", []>, TB; + +def LTRr : I<0x00, MRM3r, (outs), (ins GR16:$src), + "ltr{w}\t{$src}", []>, TB; +def LTRm : I<0x00, MRM3m, (outs), (ins i16mem:$src), + "ltr{w}\t{$src}", []>, TB; + +def PUSHCS16 : I<0x0E, RawFrm, (outs), (ins), + "push{w}\t{%cs|CS}", []>, Requires<[In32BitMode]>, OpSize; +def PUSHCS32 : I<0x0E, RawFrm, (outs), (ins), + "push{l}\t{%cs|CS}", []>, Requires<[In32BitMode]>; +def PUSHSS16 : I<0x16, RawFrm, (outs), (ins), + "push{w}\t{%ss|SS}", []>, Requires<[In32BitMode]>, OpSize; +def PUSHSS32 : I<0x16, RawFrm, (outs), (ins), + "push{l}\t{%ss|SS}", []>, Requires<[In32BitMode]>; +def PUSHDS16 : I<0x1E, RawFrm, (outs), (ins), + "push{w}\t{%ds|DS}", []>, Requires<[In32BitMode]>, OpSize; +def PUSHDS32 : I<0x1E, RawFrm, (outs), (ins), + "push{l}\t{%ds|DS}", []>, Requires<[In32BitMode]>; +def PUSHES16 : I<0x06, RawFrm, (outs), (ins), + "push{w}\t{%es|ES}", []>, Requires<[In32BitMode]>, OpSize; +def PUSHES32 : I<0x06, RawFrm, (outs), (ins), + "push{l}\t{%es|ES}", []>, Requires<[In32BitMode]>; + +def PUSHFS16 : I<0xa0, RawFrm, (outs), (ins), + "push{w}\t{%fs|FS}", []>, OpSize, TB; +def PUSHFS32 : I<0xa0, RawFrm, (outs), (ins), + "push{l}\t{%fs|FS}", []>, TB, Requires<[In32BitMode]>; +def PUSHGS16 : I<0xa8, RawFrm, (outs), (ins), + "push{w}\t{%gs|GS}", []>, OpSize, TB; +def PUSHGS32 : I<0xa8, RawFrm, (outs), (ins), + "push{l}\t{%gs|GS}", []>, TB, Requires<[In32BitMode]>; + +def PUSHFS64 : I<0xa0, RawFrm, (outs), (ins), + "push{q}\t{%fs|FS}", []>, TB; +def PUSHGS64 : I<0xa8, RawFrm, (outs), (ins), + "push{q}\t{%gs|GS}", []>, TB; + +// No "pop cs" instruction. 
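+// (On the 8086 "pop cs" was encoded as 0x0F, which later became the two-byte
+// opcode escape, so only the other segment registers have pop forms.)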
+def POPSS16 : I<0x17, RawFrm, (outs), (ins), + "pop{w}\t{%ss|SS}", []>, OpSize, Requires<[In32BitMode]>; +def POPSS32 : I<0x17, RawFrm, (outs), (ins), + "pop{l}\t{%ss|SS}", []> , Requires<[In32BitMode]>; + +def POPDS16 : I<0x1F, RawFrm, (outs), (ins), + "pop{w}\t{%ds|DS}", []>, OpSize, Requires<[In32BitMode]>; +def POPDS32 : I<0x1F, RawFrm, (outs), (ins), + "pop{l}\t{%ds|DS}", []> , Requires<[In32BitMode]>; + +def POPES16 : I<0x07, RawFrm, (outs), (ins), + "pop{w}\t{%es|ES}", []>, OpSize, Requires<[In32BitMode]>; +def POPES32 : I<0x07, RawFrm, (outs), (ins), + "pop{l}\t{%es|ES}", []> , Requires<[In32BitMode]>; + +def POPFS16 : I<0xa1, RawFrm, (outs), (ins), + "pop{w}\t{%fs|FS}", []>, OpSize, TB; +def POPFS32 : I<0xa1, RawFrm, (outs), (ins), + "pop{l}\t{%fs|FS}", []>, TB , Requires<[In32BitMode]>; +def POPFS64 : I<0xa1, RawFrm, (outs), (ins), + "pop{q}\t{%fs|FS}", []>, TB; + +def POPGS16 : I<0xa9, RawFrm, (outs), (ins), + "pop{w}\t{%gs|GS}", []>, OpSize, TB; +def POPGS32 : I<0xa9, RawFrm, (outs), (ins), + "pop{l}\t{%gs|GS}", []>, TB , Requires<[In32BitMode]>; +def POPGS64 : I<0xa9, RawFrm, (outs), (ins), + "pop{q}\t{%gs|GS}", []>, TB; + + +def LDS16rm : I<0xc5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src), + "lds{w}\t{$src, $dst|$dst, $src}", []>, OpSize; +def LDS32rm : I<0xc5, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src), + "lds{l}\t{$src, $dst|$dst, $src}", []>; + +def LSS16rm : I<0xb2, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src), + "lss{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; +def LSS32rm : I<0xb2, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src), + "lss{l}\t{$src, $dst|$dst, $src}", []>, TB; +def LSS64rm : RI<0xb2, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src), + "lss{q}\t{$src, $dst|$dst, $src}", []>, TB; + +def LES16rm : I<0xc4, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src), + "les{w}\t{$src, $dst|$dst, $src}", []>, OpSize; +def LES32rm : I<0xc4, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src), + "les{l}\t{$src, $dst|$dst, $src}", []>; + +def LFS16rm : I<0xb4, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src), + "lfs{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; +def LFS32rm : I<0xb4, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src), + "lfs{l}\t{$src, $dst|$dst, $src}", []>, TB; +def LFS64rm : RI<0xb4, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src), + "lfs{q}\t{$src, $dst|$dst, $src}", []>, TB; + +def LGS16rm : I<0xb5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src), + "lgs{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; +def LGS32rm : I<0xb5, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src), + "lgs{l}\t{$src, $dst|$dst, $src}", []>, TB; + +def LGS64rm : RI<0xb5, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src), + "lgs{q}\t{$src, $dst|$dst, $src}", []>, TB; + + +def VERRr : I<0x00, MRM4r, (outs), (ins GR16:$seg), + "verr\t$seg", []>, TB; +def VERRm : I<0x00, MRM4m, (outs), (ins i16mem:$seg), + "verr\t$seg", []>, TB; +def VERWr : I<0x00, MRM5r, (outs), (ins GR16:$seg), + "verw\t$seg", []>, TB; +def VERWm : I<0x00, MRM5m, (outs), (ins i16mem:$seg), + "verw\t$seg", []>, TB; + +//===----------------------------------------------------------------------===// +// Descriptor-table support instructions + +def SGDT16m : I<0x01, MRM0m, (outs opaque48mem:$dst), (ins), + "sgdtw\t$dst", []>, TB, OpSize, Requires<[In32BitMode]>; +def SGDTm : I<0x01, MRM0m, (outs opaque48mem:$dst), (ins), + "sgdt\t$dst", []>, TB; +def SIDT16m : I<0x01, MRM1m, (outs opaque48mem:$dst), (ins), + "sidtw\t$dst", []>, TB, OpSize, Requires<[In32BitMode]>; 
+def SIDTm : I<0x01, MRM1m, (outs opaque48mem:$dst), (ins), + "sidt\t$dst", []>, TB; +def SLDT16r : I<0x00, MRM0r, (outs GR16:$dst), (ins), + "sldt{w}\t$dst", []>, TB, OpSize; +def SLDT16m : I<0x00, MRM0m, (outs i16mem:$dst), (ins), + "sldt{w}\t$dst", []>, TB; +def SLDT32r : I<0x00, MRM0r, (outs GR32:$dst), (ins), + "sldt{l}\t$dst", []>, TB; + +// LLDT is not interpreted specially in 64-bit mode because there is no sign +// extension. +def SLDT64r : RI<0x00, MRM0r, (outs GR64:$dst), (ins), + "sldt{q}\t$dst", []>, TB; +def SLDT64m : RI<0x00, MRM0m, (outs i16mem:$dst), (ins), + "sldt{q}\t$dst", []>, TB; + +def LGDT16m : I<0x01, MRM2m, (outs), (ins opaque48mem:$src), + "lgdtw\t$src", []>, TB, OpSize, Requires<[In32BitMode]>; +def LGDTm : I<0x01, MRM2m, (outs), (ins opaque48mem:$src), + "lgdt\t$src", []>, TB; +def LIDT16m : I<0x01, MRM3m, (outs), (ins opaque48mem:$src), + "lidtw\t$src", []>, TB, OpSize, Requires<[In32BitMode]>; +def LIDTm : I<0x01, MRM3m, (outs), (ins opaque48mem:$src), + "lidt\t$src", []>, TB; +def LLDT16r : I<0x00, MRM2r, (outs), (ins GR16:$src), + "lldt{w}\t$src", []>, TB; +def LLDT16m : I<0x00, MRM2m, (outs), (ins i16mem:$src), + "lldt{w}\t$src", []>, TB; + +//===----------------------------------------------------------------------===// +// Specialized register support +def WRMSR : I<0x30, RawFrm, (outs), (ins), "wrmsr", []>, TB; +def RDMSR : I<0x32, RawFrm, (outs), (ins), "rdmsr", []>, TB; +def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", []>, TB; + +def SMSW16r : I<0x01, MRM4r, (outs GR16:$dst), (ins), + "smsw{w}\t$dst", []>, OpSize, TB; +def SMSW32r : I<0x01, MRM4r, (outs GR32:$dst), (ins), + "smsw{l}\t$dst", []>, TB; +// no m form encodable; use SMSW16m +def SMSW64r : RI<0x01, MRM4r, (outs GR64:$dst), (ins), + "smsw{q}\t$dst", []>, TB; + +// For memory operands, there is only a 16-bit form +def SMSW16m : I<0x01, MRM4m, (outs i16mem:$dst), (ins), + "smsw{w}\t$dst", []>, TB; + +def LMSW16r : I<0x01, MRM6r, (outs), (ins GR16:$src), + "lmsw{w}\t$src", []>, TB; +def LMSW16m : I<0x01, MRM6m, (outs), (ins i16mem:$src), + "lmsw{w}\t$src", []>, TB; + +def CPUID : I<0xA2, RawFrm, (outs), (ins), "cpuid", []>, TB; + +//===----------------------------------------------------------------------===// +// Cache instructions +def INVD : I<0x08, RawFrm, (outs), (ins), "invd", []>, TB; +def WBINVD : I<0x09, RawFrm, (outs), (ins), "wbinvd", []>, TB; + +//===----------------------------------------------------------------------===// +// XSAVE instructions +let Defs = [RDX, RAX], Uses = [RCX] in + def XGETBV : I<0x01, MRM_D0, (outs), (ins), "xgetbv", []>, TB; + +let Uses = [RDX, RAX, RCX] in + def XSETBV : I<0x01, MRM_D1, (outs), (ins), "xsetbv", []>, TB; + +let Uses = [RDX, RAX] in { + def XSAVE : I<0xAE, MRM4m, (outs opaque512mem:$dst), (ins), + "xsave\t$dst", []>, TB; + def XSAVE64 : I<0xAE, MRM4m, (outs opaque512mem:$dst), (ins), + "xsaveq\t$dst", []>, TB, REX_W, Requires<[In64BitMode]>; + def XRSTOR : I<0xAE, MRM5m, (outs), (ins opaque512mem:$dst), + "xrstor\t$dst", []>, TB; + def XRSTOR64 : I<0xAE, MRM5m, (outs), (ins opaque512mem:$dst), + "xrstorq\t$dst", []>, TB, REX_W, Requires<[In64BitMode]>; + def XSAVEOPT : I<0xAE, MRM6m, (outs opaque512mem:$dst), (ins), + "xsaveopt\t$dst", []>, TB; + def XSAVEOPT64 : I<0xAE, MRM6m, (outs opaque512mem:$dst), (ins), + "xsaveoptq\t$dst", []>, TB, REX_W, Requires<[In64BitMode]>; +} + +//===----------------------------------------------------------------------===// +// VIA PadLock crypto instructions +let Defs = [RAX, RDI], Uses = [RDX, RDI] in + 
def XSTORE : I<0xc0, RawFrm, (outs), (ins), "xstore", []>, A7; + +def : InstAlias<"xstorerng", (XSTORE)>; + +let Defs = [RSI, RDI], Uses = [RBX, RDX, RSI, RDI] in { + def XCRYPTECB : I<0xc8, RawFrm, (outs), (ins), "xcryptecb", []>, A7; + def XCRYPTCBC : I<0xd0, RawFrm, (outs), (ins), "xcryptcbc", []>, A7; + def XCRYPTCTR : I<0xd8, RawFrm, (outs), (ins), "xcryptctr", []>, A7; + def XCRYPTCFB : I<0xe0, RawFrm, (outs), (ins), "xcryptcfb", []>, A7; + def XCRYPTOFB : I<0xe8, RawFrm, (outs), (ins), "xcryptofb", []>, A7; +} + +let Defs = [RAX, RSI, RDI], Uses = [RAX, RSI, RDI] in { + def XSHA1 : I<0xc8, RawFrm, (outs), (ins), "xsha1", []>, A6; + def XSHA256 : I<0xd0, RawFrm, (outs), (ins), "xsha256", []>, A6; +} +let Defs = [RAX, RDX, RSI], Uses = [RAX, RSI] in + def MONTMUL : I<0xc0, RawFrm, (outs), (ins), "montmul", []>, A6; + +//===----------------------------------------------------------------------===// +// FS/GS Base Instructions +let Predicates = [In64BitMode] in { + def RDFSBASE : I<0xAE, MRM0r, (outs GR32:$dst), (ins), + "rdfsbase{l}\t$dst", []>, TB, XS; + def RDFSBASE64 : RI<0xAE, MRM0r, (outs GR64:$dst), (ins), + "rdfsbase{q}\t$dst", []>, TB, XS; + def RDGSBASE : I<0xAE, MRM1r, (outs GR32:$dst), (ins), + "rdgsbase{l}\t$dst", []>, TB, XS; + def RDGSBASE64 : RI<0xAE, MRM1r, (outs GR64:$dst), (ins), + "rdgsbase{q}\t$dst", []>, TB, XS; + def WRFSBASE : I<0xAE, MRM2r, (outs), (ins GR32:$dst), + "wrfsbase{l}\t$dst", []>, TB, XS; + def WRFSBASE64 : RI<0xAE, MRM2r, (outs), (ins GR64:$dst), + "wrfsbase{q}\t$dst", []>, TB, XS; + def WRGSBASE : I<0xAE, MRM3r, (outs), (ins GR32:$dst), + "wrgsbase{l}\t$dst", []>, TB, XS; + def WRGSBASE64 : RI<0xAE, MRM3r, (outs), (ins GR64:$dst), + "wrgsbase{q}\t$dst", []>, TB, XS; +} diff --git a/contrib/llvm/lib/Target/X86/X86InstrVMX.td b/contrib/llvm/lib/Target/X86/X86InstrVMX.td new file mode 100644 index 0000000..09a7a7d0c --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrVMX.td @@ -0,0 +1,60 @@ +//===- X86InstrVMX.td - VMX Instruction Set Extension ------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the instructions that make up the Intel VMX instruction +// set. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// VMX instructions + +// 66 0F 38 80 +def INVEPT32 : I<0x80, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2), + "invept {$src2, $src1|$src1, $src2}", []>, OpSize, T8; +def INVEPT64 : I<0x80, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2), + "invept {$src2, $src1|$src1, $src2}", []>, OpSize, T8; +// 66 0F 38 81 +def INVVPID32 : I<0x81, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2), + "invvpid {$src2, $src1|$src1, $src2}", []>, OpSize, T8; +def INVVPID64 : I<0x81, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2), + "invvpid {$src2, $src1|$src1, $src2}", []>, OpSize, T8; +// 0F 01 C1 +def VMCALL : I<0x01, MRM_C1, (outs), (ins), "vmcall", []>, TB; +def VMCLEARm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs), + "vmclear\t$vmcs", []>, OpSize, TB; +// 0F 01 C2 +def VMLAUNCH : I<0x01, MRM_C2, (outs), (ins), "vmlaunch", []>, TB; +// 0F 01 C3 +def VMRESUME : I<0x01, MRM_C3, (outs), (ins), "vmresume", []>, TB; +def VMPTRLDm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs), + "vmptrld\t$vmcs", []>, TB; +def VMPTRSTm : I<0xC7, MRM7m, (outs i64mem:$vmcs), (ins), + "vmptrst\t$vmcs", []>, TB; +def VMREAD64rm : I<0x78, MRMDestMem, (outs i64mem:$dst), (ins GR64:$src), + "vmread{q}\t{$src, $dst|$dst, $src}", []>, TB; +def VMREAD64rr : I<0x78, MRMDestReg, (outs GR64:$dst), (ins GR64:$src), + "vmread{q}\t{$src, $dst|$dst, $src}", []>, TB; +def VMREAD32rm : I<0x78, MRMDestMem, (outs i32mem:$dst), (ins GR32:$src), + "vmread{l}\t{$src, $dst|$dst, $src}", []>, TB; +def VMREAD32rr : I<0x78, MRMDestReg, (outs GR32:$dst), (ins GR32:$src), + "vmread{l}\t{$src, $dst|$dst, $src}", []>, TB; +def VMWRITE64rm : I<0x79, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, TB; +def VMWRITE64rr : I<0x79, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), + "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, TB; +def VMWRITE32rm : I<0x79, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, TB; +def VMWRITE32rr : I<0x79, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, TB; +// 0F 01 C4 +def VMXOFF : I<0x01, MRM_C4, (outs), (ins), "vmxoff", []>, TB; +def VMXON : I<0xC7, MRM6m, (outs), (ins i64mem:$vmxon), + "vmxon\t{$vmxon}", []>, XS; + diff --git a/contrib/llvm/lib/Target/X86/X86JITInfo.cpp b/contrib/llvm/lib/Target/X86/X86JITInfo.cpp new file mode 100644 index 0000000..3f88fa6 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86JITInfo.cpp @@ -0,0 +1,574 @@ +//===-- X86JITInfo.cpp - Implement the JIT interfaces for the X86 target --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the JIT interfaces for the X86 target. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "jit" +#include "X86JITInfo.h" +#include "X86Relocations.h" +#include "X86Subtarget.h" +#include "X86TargetMachine.h" +#include "llvm/Function.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Valgrind.h" +#include <cstdlib> +#include <cstring> +using namespace llvm; + +// Determine the platform we're running on +#if defined (__x86_64__) || defined (_M_AMD64) || defined (_M_X64) +# define X86_64_JIT +#elif defined(__i386__) || defined(i386) || defined(_M_IX86) +# define X86_32_JIT +#endif + +void X86JITInfo::replaceMachineCodeForFunction(void *Old, void *New) { + unsigned char *OldByte = (unsigned char *)Old; + *OldByte++ = 0xE9; // Emit JMP opcode. + unsigned *OldWord = (unsigned *)OldByte; + unsigned NewAddr = (intptr_t)New; + unsigned OldAddr = (intptr_t)OldWord; + *OldWord = NewAddr - OldAddr - 4; // Emit PC-relative addr of New code. + + // X86 doesn't need to invalidate the processor cache, so just invalidate + // Valgrind's cache directly. + sys::ValgrindDiscardTranslations(Old, 5); +} + + +/// JITCompilerFunction - This contains the address of the JIT function used to +/// compile a function lazily. +static TargetJITInfo::JITCompilerFn JITCompilerFunction; + +// Get the ASMPREFIX for the current host. This is often '_'. +#ifndef __USER_LABEL_PREFIX__ +#define __USER_LABEL_PREFIX__ +#endif +#define GETASMPREFIX2(X) #X +#define GETASMPREFIX(X) GETASMPREFIX2(X) +#define ASMPREFIX GETASMPREFIX(__USER_LABEL_PREFIX__) + +// For ELF targets, use a .size and .type directive, to let tools +// know the extent of functions defined in assembler. +#if defined(__ELF__) +# define SIZE(sym) ".size " #sym ", . - " #sym "\n" +# define TYPE_FUNCTION(sym) ".type " #sym ", @function\n" +#else +# define SIZE(sym) +# define TYPE_FUNCTION(sym) +#endif + +// Provide a convenient way for disabling usage of CFI directives. +// This is needed for old/broken assemblers (for example, gas on +// Darwin is pretty old and doesn't support these directives) +#if defined(__APPLE__) +# define CFI(x) +#else +// FIXME: Disable this until we really want to use it. Also, we will +// need to add some workarounds for compilers, which support +// only subset of these directives. +# define CFI(x) +#endif + +// Provide a wrapper for X86CompilationCallback2 that saves non-traditional +// callee saved registers, for the fastcc calling convention. +extern "C" { +#if defined(X86_64_JIT) +# ifndef _MSC_VER + // No need to save EAX/EDX for X86-64. + void X86CompilationCallback(void); + asm( + ".text\n" + ".align 8\n" + ".globl " ASMPREFIX "X86CompilationCallback\n" + TYPE_FUNCTION(X86CompilationCallback) + ASMPREFIX "X86CompilationCallback:\n" + CFI(".cfi_startproc\n") + // Save RBP + "pushq %rbp\n" + CFI(".cfi_def_cfa_offset 16\n") + CFI(".cfi_offset %rbp, -16\n") + // Save RSP + "movq %rsp, %rbp\n" + CFI(".cfi_def_cfa_register %rbp\n") + // Save all int arg registers + "pushq %rdi\n" + CFI(".cfi_rel_offset %rdi, 0\n") + "pushq %rsi\n" + CFI(".cfi_rel_offset %rsi, 8\n") + "pushq %rdx\n" + CFI(".cfi_rel_offset %rdx, 16\n") + "pushq %rcx\n" + CFI(".cfi_rel_offset %rcx, 24\n") + "pushq %r8\n" + CFI(".cfi_rel_offset %r8, 32\n") + "pushq %r9\n" + CFI(".cfi_rel_offset %r9, 40\n") + // Align stack on 16-byte boundary. ESP might not be properly aligned + // (8 byte) if this is called from an indirect stub. 
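+    // (The movaps spills below fault on a misaligned address, so the stack is
+    // realigned to a 16-byte boundary before the XMM argument registers are
+    // saved.)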
+ "andq $-16, %rsp\n" + // Save all XMM arg registers + "subq $128, %rsp\n" + "movaps %xmm0, (%rsp)\n" + "movaps %xmm1, 16(%rsp)\n" + "movaps %xmm2, 32(%rsp)\n" + "movaps %xmm3, 48(%rsp)\n" + "movaps %xmm4, 64(%rsp)\n" + "movaps %xmm5, 80(%rsp)\n" + "movaps %xmm6, 96(%rsp)\n" + "movaps %xmm7, 112(%rsp)\n" + // JIT callee +#ifdef _WIN64 + "subq $32, %rsp\n" + "movq %rbp, %rcx\n" // Pass prev frame and return address + "movq 8(%rbp), %rdx\n" + "call " ASMPREFIX "X86CompilationCallback2\n" + "addq $32, %rsp\n" +#else + "movq %rbp, %rdi\n" // Pass prev frame and return address + "movq 8(%rbp), %rsi\n" + "call " ASMPREFIX "X86CompilationCallback2\n" +#endif + // Restore all XMM arg registers + "movaps 112(%rsp), %xmm7\n" + "movaps 96(%rsp), %xmm6\n" + "movaps 80(%rsp), %xmm5\n" + "movaps 64(%rsp), %xmm4\n" + "movaps 48(%rsp), %xmm3\n" + "movaps 32(%rsp), %xmm2\n" + "movaps 16(%rsp), %xmm1\n" + "movaps (%rsp), %xmm0\n" + // Restore RSP + "movq %rbp, %rsp\n" + CFI(".cfi_def_cfa_register %rsp\n") + // Restore all int arg registers + "subq $48, %rsp\n" + CFI(".cfi_adjust_cfa_offset 48\n") + "popq %r9\n" + CFI(".cfi_adjust_cfa_offset -8\n") + CFI(".cfi_restore %r9\n") + "popq %r8\n" + CFI(".cfi_adjust_cfa_offset -8\n") + CFI(".cfi_restore %r8\n") + "popq %rcx\n" + CFI(".cfi_adjust_cfa_offset -8\n") + CFI(".cfi_restore %rcx\n") + "popq %rdx\n" + CFI(".cfi_adjust_cfa_offset -8\n") + CFI(".cfi_restore %rdx\n") + "popq %rsi\n" + CFI(".cfi_adjust_cfa_offset -8\n") + CFI(".cfi_restore %rsi\n") + "popq %rdi\n" + CFI(".cfi_adjust_cfa_offset -8\n") + CFI(".cfi_restore %rdi\n") + // Restore RBP + "popq %rbp\n" + CFI(".cfi_adjust_cfa_offset -8\n") + CFI(".cfi_restore %rbp\n") + "ret\n" + CFI(".cfi_endproc\n") + SIZE(X86CompilationCallback) + ); +# else + // No inline assembler support on this platform. The routine is in external + // file. + void X86CompilationCallback(); + +# endif +#elif defined (X86_32_JIT) +# ifndef _MSC_VER + void X86CompilationCallback(void); + asm( + ".text\n" + ".align 8\n" + ".globl " ASMPREFIX "X86CompilationCallback\n" + TYPE_FUNCTION(X86CompilationCallback) + ASMPREFIX "X86CompilationCallback:\n" + CFI(".cfi_startproc\n") + "pushl %ebp\n" + CFI(".cfi_def_cfa_offset 8\n") + CFI(".cfi_offset %ebp, -8\n") + "movl %esp, %ebp\n" // Standard prologue + CFI(".cfi_def_cfa_register %ebp\n") + "pushl %eax\n" + CFI(".cfi_rel_offset %eax, 0\n") + "pushl %edx\n" // Save EAX/EDX/ECX + CFI(".cfi_rel_offset %edx, 4\n") + "pushl %ecx\n" + CFI(".cfi_rel_offset %ecx, 8\n") +# if defined(__APPLE__) + "andl $-16, %esp\n" // Align ESP on 16-byte boundary +# endif + "subl $16, %esp\n" + "movl 4(%ebp), %eax\n" // Pass prev frame and return address + "movl %eax, 4(%esp)\n" + "movl %ebp, (%esp)\n" + "call " ASMPREFIX "X86CompilationCallback2\n" + "movl %ebp, %esp\n" // Restore ESP + CFI(".cfi_def_cfa_register %esp\n") + "subl $12, %esp\n" + CFI(".cfi_adjust_cfa_offset 12\n") + "popl %ecx\n" + CFI(".cfi_adjust_cfa_offset -4\n") + CFI(".cfi_restore %ecx\n") + "popl %edx\n" + CFI(".cfi_adjust_cfa_offset -4\n") + CFI(".cfi_restore %edx\n") + "popl %eax\n" + CFI(".cfi_adjust_cfa_offset -4\n") + CFI(".cfi_restore %eax\n") + "popl %ebp\n" + CFI(".cfi_adjust_cfa_offset -4\n") + CFI(".cfi_restore %ebp\n") + "ret\n" + CFI(".cfi_endproc\n") + SIZE(X86CompilationCallback) + ); + + // Same as X86CompilationCallback but also saves XMM argument registers. 
+ void X86CompilationCallback_SSE(void); + asm( + ".text\n" + ".align 8\n" + ".globl " ASMPREFIX "X86CompilationCallback_SSE\n" + TYPE_FUNCTION(X86CompilationCallback_SSE) + ASMPREFIX "X86CompilationCallback_SSE:\n" + CFI(".cfi_startproc\n") + "pushl %ebp\n" + CFI(".cfi_def_cfa_offset 8\n") + CFI(".cfi_offset %ebp, -8\n") + "movl %esp, %ebp\n" // Standard prologue + CFI(".cfi_def_cfa_register %ebp\n") + "pushl %eax\n" + CFI(".cfi_rel_offset %eax, 0\n") + "pushl %edx\n" // Save EAX/EDX/ECX + CFI(".cfi_rel_offset %edx, 4\n") + "pushl %ecx\n" + CFI(".cfi_rel_offset %ecx, 8\n") + "andl $-16, %esp\n" // Align ESP on 16-byte boundary + // Save all XMM arg registers + "subl $64, %esp\n" + // FIXME: provide frame move information for xmm registers. + // This can be tricky, because CFA register is ebp (unaligned) + // and we need to produce offsets relative to it. + "movaps %xmm0, (%esp)\n" + "movaps %xmm1, 16(%esp)\n" + "movaps %xmm2, 32(%esp)\n" + "movaps %xmm3, 48(%esp)\n" + "subl $16, %esp\n" + "movl 4(%ebp), %eax\n" // Pass prev frame and return address + "movl %eax, 4(%esp)\n" + "movl %ebp, (%esp)\n" + "call " ASMPREFIX "X86CompilationCallback2\n" + "addl $16, %esp\n" + "movaps 48(%esp), %xmm3\n" + CFI(".cfi_restore %xmm3\n") + "movaps 32(%esp), %xmm2\n" + CFI(".cfi_restore %xmm2\n") + "movaps 16(%esp), %xmm1\n" + CFI(".cfi_restore %xmm1\n") + "movaps (%esp), %xmm0\n" + CFI(".cfi_restore %xmm0\n") + "movl %ebp, %esp\n" // Restore ESP + CFI(".cfi_def_cfa_register esp\n") + "subl $12, %esp\n" + CFI(".cfi_adjust_cfa_offset 12\n") + "popl %ecx\n" + CFI(".cfi_adjust_cfa_offset -4\n") + CFI(".cfi_restore %ecx\n") + "popl %edx\n" + CFI(".cfi_adjust_cfa_offset -4\n") + CFI(".cfi_restore %edx\n") + "popl %eax\n" + CFI(".cfi_adjust_cfa_offset -4\n") + CFI(".cfi_restore %eax\n") + "popl %ebp\n" + CFI(".cfi_adjust_cfa_offset -4\n") + CFI(".cfi_restore %ebp\n") + "ret\n" + CFI(".cfi_endproc\n") + SIZE(X86CompilationCallback_SSE) + ); +# else + void X86CompilationCallback2(intptr_t *StackPtr, intptr_t RetAddr); + + _declspec(naked) void X86CompilationCallback(void) { + __asm { + push ebp + mov ebp, esp + push eax + push edx + push ecx + and esp, -16 + sub esp, 16 + mov eax, dword ptr [ebp+4] + mov dword ptr [esp+4], eax + mov dword ptr [esp], ebp + call X86CompilationCallback2 + mov esp, ebp + sub esp, 12 + pop ecx + pop edx + pop eax + pop ebp + ret + } + } + +# endif // _MSC_VER + +#else // Not an i386 host + void X86CompilationCallback() { + llvm_unreachable("Cannot call X86CompilationCallback() on a non-x86 arch!"); + } +#endif +} + +/// X86CompilationCallback2 - This is the target-specific function invoked by the +/// function stub when we did not know the real target of a call. This function +/// must locate the start of the stub or call site and pass it into the JIT +/// compiler function. +extern "C" { +#if !(defined (X86_64_JIT) && defined(_MSC_VER)) + // the following function is called only from this translation unit, + // unless we are under 64bit Windows with MSC, where there is + // no support for inline assembly +static +#endif +void LLVM_ATTRIBUTE_USED +X86CompilationCallback2(intptr_t *StackPtr, intptr_t RetAddr) { + intptr_t *RetAddrLoc = &StackPtr[1]; + assert(*RetAddrLoc == RetAddr && + "Could not find return address on the stack!"); + + // It's a stub if there is an interrupt marker after the call. + bool isStub = ((unsigned char*)RetAddr)[0] == 0xCE; + + // The call instruction should have pushed the return value onto the stack... 
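+  // Back up from the return address to the relocatable field of the call:
+  // e.g. on x86-32 the call is "E8 <rel32>", so with return address R the
+  // rel32 field lives at R-4, and rewriting it to NewVal-R below makes the
+  // next execution of the call land directly on the compiled code.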
+#if defined (X86_64_JIT) + RetAddr--; // Backtrack to the reference itself... +#else + RetAddr -= 4; // Backtrack to the reference itself... +#endif + +#if 0 + DEBUG(dbgs() << "In callback! Addr=" << (void*)RetAddr + << " ESP=" << (void*)StackPtr + << ": Resolving call to function: " + << TheVM->getFunctionReferencedName((void*)RetAddr) << "\n"); +#endif + + // Sanity check to make sure this really is a call instruction. +#if defined (X86_64_JIT) + assert(((unsigned char*)RetAddr)[-2] == 0x41 &&"Not a call instr!"); + assert(((unsigned char*)RetAddr)[-1] == 0xFF &&"Not a call instr!"); +#else + assert(((unsigned char*)RetAddr)[-1] == 0xE8 &&"Not a call instr!"); +#endif + + intptr_t NewVal = (intptr_t)JITCompilerFunction((void*)RetAddr); + + // Rewrite the call target... so that we don't end up here every time we + // execute the call. +#if defined (X86_64_JIT) + assert(isStub && + "X86-64 doesn't support rewriting non-stub lazy compilation calls:" + " the call instruction varies too much."); +#else + *(intptr_t *)RetAddr = (intptr_t)(NewVal-RetAddr-4); +#endif + + if (isStub) { + // If this is a stub, rewrite the call into an unconditional branch + // instruction so that two return addresses are not pushed onto the stack + // when the requested function finally gets called. This also makes the + // 0xCE byte (interrupt) dead, so the marker doesn't effect anything. +#if defined (X86_64_JIT) + // If the target address is within 32-bit range of the stub, use a + // PC-relative branch instead of loading the actual address. (This is + // considerably shorter than the 64-bit immediate load already there.) + // We assume here intptr_t is 64 bits. + intptr_t diff = NewVal-RetAddr+7; + if (diff >= -2147483648LL && diff <= 2147483647LL) { + *(unsigned char*)(RetAddr-0xc) = 0xE9; + *(intptr_t *)(RetAddr-0xb) = diff & 0xffffffff; + } else { + *(intptr_t *)(RetAddr - 0xa) = NewVal; + ((unsigned char*)RetAddr)[0] = (2 | (4 << 3) | (3 << 6)); + } + sys::ValgrindDiscardTranslations((void*)(RetAddr-0xc), 0xd); +#else + ((unsigned char*)RetAddr)[-1] = 0xE9; + sys::ValgrindDiscardTranslations((void*)(RetAddr-1), 5); +#endif + } + + // Change the return address to reexecute the call instruction... +#if defined (X86_64_JIT) + *RetAddrLoc -= 0xd; +#else + *RetAddrLoc -= 5; +#endif +} +} + +TargetJITInfo::LazyResolverFn +X86JITInfo::getLazyResolverFunction(JITCompilerFn F) { + JITCompilerFunction = F; + +#if defined (X86_32_JIT) && !defined (_MSC_VER) + if (Subtarget->hasSSE1()) + return X86CompilationCallback_SSE; +#endif + + return X86CompilationCallback; +} + +X86JITInfo::X86JITInfo(X86TargetMachine &tm) : TM(tm) { + Subtarget = &TM.getSubtarget<X86Subtarget>(); + useGOT = 0; + TLSOffset = 0; +} + +void *X86JITInfo::emitGlobalValueIndirectSym(const GlobalValue* GV, void *ptr, + JITCodeEmitter &JCE) { +#if defined (X86_64_JIT) + const unsigned Alignment = 8; + uint8_t Buffer[8]; + uint8_t *Cur = Buffer; + MachineCodeEmitter::emitWordLEInto(Cur, (unsigned)(intptr_t)ptr); + MachineCodeEmitter::emitWordLEInto(Cur, (unsigned)(((intptr_t)ptr) >> 32)); +#else + const unsigned Alignment = 4; + uint8_t Buffer[4]; + uint8_t *Cur = Buffer; + MachineCodeEmitter::emitWordLEInto(Cur, (intptr_t)ptr); +#endif + return JCE.allocIndirectGV(GV, Buffer, sizeof(Buffer), Alignment); +} + +TargetJITInfo::StubLayout X86JITInfo::getStubLayout() { + // The 64-bit stub contains: + // movabs r10 <- 8-byte-target-address # 10 bytes + // call|jmp *r10 # 3 bytes + // The 32-bit stub contains a 5-byte call|jmp. 
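+  // (In bytes, the 64-bit form above is roughly 49 BA <imm64> for
+  // movabsq $target, %r10 followed by 41 FF D2 or 41 FF E2 for
+  // callq/jmpq *%r10, 13 bytes in total.)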
+ // If the stub is a call to the compilation callback, an extra byte is added + // to mark it as a stub. + StubLayout Result = {14, 4}; + return Result; +} + +void *X86JITInfo::emitFunctionStub(const Function* F, void *Target, + JITCodeEmitter &JCE) { + // Note, we cast to intptr_t here to silence a -pedantic warning that + // complains about casting a function pointer to a normal pointer. +#if defined (X86_32_JIT) && !defined (_MSC_VER) + bool NotCC = (Target != (void*)(intptr_t)X86CompilationCallback && + Target != (void*)(intptr_t)X86CompilationCallback_SSE); +#else + bool NotCC = Target != (void*)(intptr_t)X86CompilationCallback; +#endif + JCE.emitAlignment(4); + void *Result = (void*)JCE.getCurrentPCValue(); + if (NotCC) { +#if defined (X86_64_JIT) + JCE.emitByte(0x49); // REX prefix + JCE.emitByte(0xB8+2); // movabsq r10 + JCE.emitWordLE((unsigned)(intptr_t)Target); + JCE.emitWordLE((unsigned)(((intptr_t)Target) >> 32)); + JCE.emitByte(0x41); // REX prefix + JCE.emitByte(0xFF); // jmpq *r10 + JCE.emitByte(2 | (4 << 3) | (3 << 6)); +#else + JCE.emitByte(0xE9); + JCE.emitWordLE((intptr_t)Target-JCE.getCurrentPCValue()-4); +#endif + return Result; + } + +#if defined (X86_64_JIT) + JCE.emitByte(0x49); // REX prefix + JCE.emitByte(0xB8+2); // movabsq r10 + JCE.emitWordLE((unsigned)(intptr_t)Target); + JCE.emitWordLE((unsigned)(((intptr_t)Target) >> 32)); + JCE.emitByte(0x41); // REX prefix + JCE.emitByte(0xFF); // callq *r10 + JCE.emitByte(2 | (2 << 3) | (3 << 6)); +#else + JCE.emitByte(0xE8); // Call with 32 bit pc-rel destination... + + JCE.emitWordLE((intptr_t)Target-JCE.getCurrentPCValue()-4); +#endif + + // This used to use 0xCD, but that value is used by JITMemoryManager to + // initialize the buffer with garbage, which means it may follow a + // noreturn function call, confusing X86CompilationCallback2. PR 4929. + JCE.emitByte(0xCE); // Interrupt - Just a marker identifying the stub! + return Result; +} + +/// getPICJumpTableEntry - Returns the value of the jumptable entry for the +/// specific basic block. +uintptr_t X86JITInfo::getPICJumpTableEntry(uintptr_t BB, uintptr_t Entry) { +#if defined(X86_64_JIT) + return BB - Entry; +#else + return BB - PICBase; +#endif +} + +/// relocate - Before the JIT can run a block of code that has been emitted, +/// it must rewrite the code to contain the actual addresses of any +/// referenced global symbols. +void X86JITInfo::relocate(void *Function, MachineRelocation *MR, + unsigned NumRelocs, unsigned char* GOTBase) { + for (unsigned i = 0; i != NumRelocs; ++i, ++MR) { + void *RelocPos = (char*)Function + MR->getMachineCodeOffset(); + intptr_t ResultPtr = (intptr_t)MR->getResultPointer(); + switch ((X86::RelocationType)MR->getRelocationType()) { + case X86::reloc_pcrel_word: { + // PC relative relocation, add the relocated value to the value already in + // memory, after we adjust it for where the PC is. + ResultPtr = ResultPtr -(intptr_t)RelocPos - 4 - MR->getConstantVal(); + *((unsigned*)RelocPos) += (unsigned)ResultPtr; + break; + } + case X86::reloc_picrel_word: { + // PIC base relative relocation, add the relocated value to the value + // already in memory, after we adjust it for where the PIC base is. + ResultPtr = ResultPtr - ((intptr_t)Function + MR->getConstantVal()); + *((unsigned*)RelocPos) += (unsigned)ResultPtr; + break; + } + case X86::reloc_absolute_word: + case X86::reloc_absolute_word_sext: + // Absolute relocation, just add the relocated value to the value already + // in memory. 
+ *((unsigned*)RelocPos) += (unsigned)ResultPtr; + break; + case X86::reloc_absolute_dword: + *((intptr_t*)RelocPos) += ResultPtr; + break; + } + } +} + +char* X86JITInfo::allocateThreadLocalMemory(size_t size) { +#if defined(X86_32_JIT) && !defined(__APPLE__) && !defined(_MSC_VER) + TLSOffset -= size; + return TLSOffset; +#else + llvm_unreachable("Cannot allocate thread local storage on this arch!"); + return 0; +#endif +} diff --git a/contrib/llvm/lib/Target/X86/X86JITInfo.h b/contrib/llvm/lib/Target/X86/X86JITInfo.h new file mode 100644 index 0000000..238420c --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86JITInfo.h @@ -0,0 +1,81 @@ +//===- X86JITInfo.h - X86 implementation of the JIT interface --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the X86 implementation of the TargetJITInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef X86JITINFO_H +#define X86JITINFO_H + +#include "llvm/Function.h" +#include "llvm/CodeGen/JITCodeEmitter.h" +#include "llvm/Target/TargetJITInfo.h" + +namespace llvm { + class X86TargetMachine; + class X86Subtarget; + + class X86JITInfo : public TargetJITInfo { + X86TargetMachine &TM; + const X86Subtarget *Subtarget; + uintptr_t PICBase; + char* TLSOffset; + public: + explicit X86JITInfo(X86TargetMachine &tm); + + /// replaceMachineCodeForFunction - Make it so that calling the function + /// whose machine code is at OLD turns into a call to NEW, perhaps by + /// overwriting OLD with a branch to NEW. This is used for self-modifying + /// code. + /// + virtual void replaceMachineCodeForFunction(void *Old, void *New); + + /// emitGlobalValueIndirectSym - Use the specified JITCodeEmitter object + /// to emit an indirect symbol which contains the address of the specified + /// ptr. + virtual void *emitGlobalValueIndirectSym(const GlobalValue* GV, void *ptr, + JITCodeEmitter &JCE); + + // getStubLayout - Returns the size and alignment of the largest call stub + // on X86. + virtual StubLayout getStubLayout(); + + /// emitFunctionStub - Use the specified JITCodeEmitter object to emit a + /// small native function that simply calls the function at the specified + /// address. + virtual void *emitFunctionStub(const Function* F, void *Target, + JITCodeEmitter &JCE); + + /// getPICJumpTableEntry - Returns the value of the jumptable entry for the + /// specific basic block. + virtual uintptr_t getPICJumpTableEntry(uintptr_t BB, uintptr_t JTBase); + + /// getLazyResolverFunction - Expose the lazy resolver to the JIT. + virtual LazyResolverFn getLazyResolverFunction(JITCompilerFn); + + /// relocate - Before the JIT can run a block of code that has been emitted, + /// it must rewrite the code to contain the actual addresses of any + /// referenced global symbols. + virtual void relocate(void *Function, MachineRelocation *MR, + unsigned NumRelocs, unsigned char* GOTBase); + + /// allocateThreadLocalMemory - Each target has its own way of + /// handling thread local variables. This method returns a value only + /// meaningful to the target. + virtual char* allocateThreadLocalMemory(size_t size); + + /// setPICBase / getPICBase - Getter / setter of PICBase, used to compute + /// PIC jumptable entry. 
+ void setPICBase(uintptr_t Base) { PICBase = Base; } + uintptr_t getPICBase() const { return PICBase; } + }; +} + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp b/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp new file mode 100644 index 0000000..50bc14d --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -0,0 +1,701 @@ +//===-- X86MCInstLower.cpp - Convert X86 MachineInstr to an MCInst --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains code to lower X86 MachineInstrs to their corresponding +// MCInst records. +// +//===----------------------------------------------------------------------===// + +#include "InstPrinter/X86ATTInstPrinter.h" +#include "X86MCInstLower.h" +#include "X86AsmPrinter.h" +#include "X86COFFMachineModuleInfo.h" +#include "llvm/CodeGen/MachineModuleInfoImpls.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Target/Mangler.h" +#include "llvm/Support/FormattedStream.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/Type.h" +using namespace llvm; + +X86MCInstLower::X86MCInstLower(Mangler *mang, const MachineFunction &mf, + X86AsmPrinter &asmprinter) +: Ctx(mf.getContext()), Mang(mang), MF(mf), TM(mf.getTarget()), + MAI(*TM.getMCAsmInfo()), AsmPrinter(asmprinter) {} + +MachineModuleInfoMachO &X86MCInstLower::getMachOMMI() const { + return MF.getMMI().getObjFileInfo<MachineModuleInfoMachO>(); +} + + +/// GetSymbolFromOperand - Lower an MO_GlobalAddress or MO_ExternalSymbol +/// operand to an MCSymbol. +MCSymbol *X86MCInstLower:: +GetSymbolFromOperand(const MachineOperand &MO) const { + assert((MO.isGlobal() || MO.isSymbol()) && "Isn't a symbol reference"); + + SmallString<128> Name; + + if (!MO.isGlobal()) { + assert(MO.isSymbol()); + Name += MAI.getGlobalPrefix(); + Name += MO.getSymbolName(); + } else { + const GlobalValue *GV = MO.getGlobal(); + bool isImplicitlyPrivate = false; + if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB || + MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY || + MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE || + MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE) + isImplicitlyPrivate = true; + + Mang->getNameWithPrefix(Name, GV, isImplicitlyPrivate); + } + + // If the target flags on the operand changes the name of the symbol, do that + // before we return the symbol. + switch (MO.getTargetFlags()) { + default: break; + case X86II::MO_DLLIMPORT: { + // Handle dllimport linkage. 
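+      // For example (illustrative): a reference to an imported symbol Foo is
+      // rewritten to "__imp_Foo", the import-table entry the linker creates.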
+ const char *Prefix = "__imp_"; + Name.insert(Name.begin(), Prefix, Prefix+strlen(Prefix)); + break; + } + case X86II::MO_DARWIN_NONLAZY: + case X86II::MO_DARWIN_NONLAZY_PIC_BASE: { + Name += "$non_lazy_ptr"; + MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name.str()); + + MachineModuleInfoImpl::StubValueTy &StubSym = + getMachOMMI().getGVStubEntry(Sym); + if (StubSym.getPointer() == 0) { + assert(MO.isGlobal() && "Extern symbol not handled yet"); + StubSym = + MachineModuleInfoImpl:: + StubValueTy(Mang->getSymbol(MO.getGlobal()), + !MO.getGlobal()->hasInternalLinkage()); + } + return Sym; + } + case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: { + Name += "$non_lazy_ptr"; + MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name.str()); + MachineModuleInfoImpl::StubValueTy &StubSym = + getMachOMMI().getHiddenGVStubEntry(Sym); + if (StubSym.getPointer() == 0) { + assert(MO.isGlobal() && "Extern symbol not handled yet"); + StubSym = + MachineModuleInfoImpl:: + StubValueTy(Mang->getSymbol(MO.getGlobal()), + !MO.getGlobal()->hasInternalLinkage()); + } + return Sym; + } + case X86II::MO_DARWIN_STUB: { + Name += "$stub"; + MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name.str()); + MachineModuleInfoImpl::StubValueTy &StubSym = + getMachOMMI().getFnStubEntry(Sym); + if (StubSym.getPointer()) + return Sym; + + if (MO.isGlobal()) { + StubSym = + MachineModuleInfoImpl:: + StubValueTy(Mang->getSymbol(MO.getGlobal()), + !MO.getGlobal()->hasInternalLinkage()); + } else { + Name.erase(Name.end()-5, Name.end()); + StubSym = + MachineModuleInfoImpl:: + StubValueTy(Ctx.GetOrCreateSymbol(Name.str()), false); + } + return Sym; + } + } + + return Ctx.GetOrCreateSymbol(Name.str()); +} + +MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO, + MCSymbol *Sym) const { + // FIXME: We would like an efficient form for this, so we don't have to do a + // lot of extra uniquing. + const MCExpr *Expr = 0; + MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None; + + switch (MO.getTargetFlags()) { + default: llvm_unreachable("Unknown target flag on GV operand"); + case X86II::MO_NO_FLAG: // No flag. + // These affect the name of the symbol, not any suffix. + case X86II::MO_DARWIN_NONLAZY: + case X86II::MO_DLLIMPORT: + case X86II::MO_DARWIN_STUB: + break; + + case X86II::MO_TLVP: RefKind = MCSymbolRefExpr::VK_TLVP; break; + case X86II::MO_TLVP_PIC_BASE: + Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_TLVP, Ctx); + // Subtract the pic base. + Expr = MCBinaryExpr::CreateSub(Expr, + MCSymbolRefExpr::Create(MF.getPICBaseSymbol(), + Ctx), + Ctx); + break; + case X86II::MO_TLSGD: RefKind = MCSymbolRefExpr::VK_TLSGD; break; + case X86II::MO_GOTTPOFF: RefKind = MCSymbolRefExpr::VK_GOTTPOFF; break; + case X86II::MO_INDNTPOFF: RefKind = MCSymbolRefExpr::VK_INDNTPOFF; break; + case X86II::MO_TPOFF: RefKind = MCSymbolRefExpr::VK_TPOFF; break; + case X86II::MO_NTPOFF: RefKind = MCSymbolRefExpr::VK_NTPOFF; break; + case X86II::MO_GOTPCREL: RefKind = MCSymbolRefExpr::VK_GOTPCREL; break; + case X86II::MO_GOT: RefKind = MCSymbolRefExpr::VK_GOT; break; + case X86II::MO_GOTOFF: RefKind = MCSymbolRefExpr::VK_GOTOFF; break; + case X86II::MO_PLT: RefKind = MCSymbolRefExpr::VK_PLT; break; + case X86II::MO_PIC_BASE_OFFSET: + case X86II::MO_DARWIN_NONLAZY_PIC_BASE: + case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: + Expr = MCSymbolRefExpr::Create(Sym, Ctx); + // Subtract the pic base. 
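+    // The resulting operand takes the form (illustrative):
+    //   L_foo$non_lazy_ptr - "L1$pb"
+    // so the pointer load is resolved relative to the picbase label.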
+ Expr = MCBinaryExpr::CreateSub(Expr, + MCSymbolRefExpr::Create(MF.getPICBaseSymbol(), Ctx), + Ctx); + if (MO.isJTI() && MAI.hasSetDirective()) { + // If .set directive is supported, use it to reduce the number of + // relocations the assembler will generate for differences between + // local labels. This is only safe when the symbols are in the same + // section so we are restricting it to jumptable references. + MCSymbol *Label = Ctx.CreateTempSymbol(); + AsmPrinter.OutStreamer.EmitAssignment(Label, Expr); + Expr = MCSymbolRefExpr::Create(Label, Ctx); + } + break; + } + + if (Expr == 0) + Expr = MCSymbolRefExpr::Create(Sym, RefKind, Ctx); + + if (!MO.isJTI() && MO.getOffset()) + Expr = MCBinaryExpr::CreateAdd(Expr, + MCConstantExpr::Create(MO.getOffset(), Ctx), + Ctx); + return MCOperand::CreateExpr(Expr); +} + + + +static void lower_subreg32(MCInst *MI, unsigned OpNo) { + // Convert registers in the addr mode according to subreg32. + unsigned Reg = MI->getOperand(OpNo).getReg(); + if (Reg != 0) + MI->getOperand(OpNo).setReg(getX86SubSuperRegister(Reg, MVT::i32)); +} + +static void lower_lea64_32mem(MCInst *MI, unsigned OpNo) { + // Convert registers in the addr mode according to subreg64. + for (unsigned i = 0; i != 4; ++i) { + if (!MI->getOperand(OpNo+i).isReg()) continue; + + unsigned Reg = MI->getOperand(OpNo+i).getReg(); + if (Reg == 0) continue; + + MI->getOperand(OpNo+i).setReg(getX86SubSuperRegister(Reg, MVT::i64)); + } +} + +/// LowerSubReg32_Op0 - Things like MOVZX16rr8 -> MOVZX32rr8. +static void LowerSubReg32_Op0(MCInst &OutMI, unsigned NewOpc) { + OutMI.setOpcode(NewOpc); + lower_subreg32(&OutMI, 0); +} +/// LowerUnaryToTwoAddr - R = setb -> R = sbb R, R +static void LowerUnaryToTwoAddr(MCInst &OutMI, unsigned NewOpc) { + OutMI.setOpcode(NewOpc); + OutMI.addOperand(OutMI.getOperand(0)); + OutMI.addOperand(OutMI.getOperand(0)); +} + +/// \brief Simplify FOO $imm, %{al,ax,eax,rax} to FOO $imm, for instruction with +/// a short fixed-register form. +static void SimplifyShortImmForm(MCInst &Inst, unsigned Opcode) { + unsigned ImmOp = Inst.getNumOperands() - 1; + assert(Inst.getOperand(0).isReg() && Inst.getOperand(ImmOp).isImm() && + ((Inst.getNumOperands() == 3 && Inst.getOperand(1).isReg() && + Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg()) || + Inst.getNumOperands() == 2) && "Unexpected instruction!"); + + // Check whether the destination register can be fixed. + unsigned Reg = Inst.getOperand(0).getReg(); + if (Reg != X86::AL && Reg != X86::AX && Reg != X86::EAX && Reg != X86::RAX) + return; + + // If so, rewrite the instruction. + MCOperand Saved = Inst.getOperand(ImmOp); + Inst = MCInst(); + Inst.setOpcode(Opcode); + Inst.addOperand(Saved); +} + +/// \brief Simplify things like MOV32rm to MOV32o32a. +static void SimplifyShortMoveForm(X86AsmPrinter &Printer, MCInst &Inst, + unsigned Opcode) { + // Don't make these simplifications in 64-bit mode; other assemblers don't + // perform them because they make the code larger. + if (Printer.getSubtarget().is64Bit()) + return; + + bool IsStore = Inst.getOperand(0).isReg() && Inst.getOperand(1).isReg(); + unsigned AddrBase = IsStore; + unsigned RegOp = IsStore ? 
0 : 5; + unsigned AddrOp = AddrBase + 3; + assert(Inst.getNumOperands() == 6 && Inst.getOperand(RegOp).isReg() && + Inst.getOperand(AddrBase + 0).isReg() && // base + Inst.getOperand(AddrBase + 1).isImm() && // scale + Inst.getOperand(AddrBase + 2).isReg() && // index register + (Inst.getOperand(AddrOp).isExpr() || // address + Inst.getOperand(AddrOp).isImm())&& + Inst.getOperand(AddrBase + 4).isReg() && // segment + "Unexpected instruction!"); + + // Check whether the destination register can be fixed. + unsigned Reg = Inst.getOperand(RegOp).getReg(); + if (Reg != X86::AL && Reg != X86::AX && Reg != X86::EAX && Reg != X86::RAX) + return; + + // Check whether this is an absolute address. + // FIXME: We know TLVP symbol refs aren't, but there should be a better way + // to do this here. + bool Absolute = true; + if (Inst.getOperand(AddrOp).isExpr()) { + const MCExpr *MCE = Inst.getOperand(AddrOp).getExpr(); + if (const MCSymbolRefExpr *SRE = dyn_cast<MCSymbolRefExpr>(MCE)) + if (SRE->getKind() == MCSymbolRefExpr::VK_TLVP) + Absolute = false; + } + + if (Absolute && + (Inst.getOperand(AddrBase + 0).getReg() != 0 || + Inst.getOperand(AddrBase + 2).getReg() != 0 || + Inst.getOperand(AddrBase + 4).getReg() != 0 || + Inst.getOperand(AddrBase + 1).getImm() != 1)) + return; + + // If so, rewrite the instruction. + MCOperand Saved = Inst.getOperand(AddrOp); + Inst = MCInst(); + Inst.setOpcode(Opcode); + Inst.addOperand(Saved); +} + +void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { + OutMI.setOpcode(MI->getOpcode()); + + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + + MCOperand MCOp; + switch (MO.getType()) { + default: + MI->dump(); + llvm_unreachable("unknown operand type"); + case MachineOperand::MO_Register: + // Ignore all implicit register operands. + if (MO.isImplicit()) continue; + MCOp = MCOperand::CreateReg(MO.getReg()); + break; + case MachineOperand::MO_Immediate: + MCOp = MCOperand::CreateImm(MO.getImm()); + break; + case MachineOperand::MO_MachineBasicBlock: + MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create( + MO.getMBB()->getSymbol(), Ctx)); + break; + case MachineOperand::MO_GlobalAddress: + case MachineOperand::MO_ExternalSymbol: + MCOp = LowerSymbolOperand(MO, GetSymbolFromOperand(MO)); + break; + case MachineOperand::MO_JumpTableIndex: + MCOp = LowerSymbolOperand(MO, AsmPrinter.GetJTISymbol(MO.getIndex())); + break; + case MachineOperand::MO_ConstantPoolIndex: + MCOp = LowerSymbolOperand(MO, AsmPrinter.GetCPISymbol(MO.getIndex())); + break; + case MachineOperand::MO_BlockAddress: + MCOp = LowerSymbolOperand(MO, + AsmPrinter.GetBlockAddressSymbol(MO.getBlockAddress())); + break; + } + + OutMI.addOperand(MCOp); + } + + // Handle a few special cases to eliminate operand modifiers. +ReSimplify: + switch (OutMI.getOpcode()) { + case X86::LEA64_32r: // Handle 'subreg rewriting' for the lea64_32mem operand. + lower_lea64_32mem(&OutMI, 1); + // FALL THROUGH. + case X86::LEA64r: + case X86::LEA16r: + case X86::LEA32r: + // LEA should have a segment register, but it must be empty. 
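+    // The memory reference uses the usual five X86 address operands (base,
+    // scale, index, displacement, segment), so together with the destination
+    // an LEA carries 1 + X86::AddrNumOperands MCInst operands; roughly:
+    //   lea Dst, [Base + Scale*Index + Disp]   (segment must be empty)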
+ assert(OutMI.getNumOperands() == 1+X86::AddrNumOperands && + "Unexpected # of LEA operands"); + assert(OutMI.getOperand(1+X86::AddrSegmentReg).getReg() == 0 && + "LEA has segment specified!"); + break; + case X86::MOVZX64rr32: LowerSubReg32_Op0(OutMI, X86::MOV32rr); break; + case X86::MOVZX64rm32: LowerSubReg32_Op0(OutMI, X86::MOV32rm); break; + case X86::MOV64ri64i32: LowerSubReg32_Op0(OutMI, X86::MOV32ri); break; + case X86::MOVZX64rr8: LowerSubReg32_Op0(OutMI, X86::MOVZX32rr8); break; + case X86::MOVZX64rm8: LowerSubReg32_Op0(OutMI, X86::MOVZX32rm8); break; + case X86::MOVZX64rr16: LowerSubReg32_Op0(OutMI, X86::MOVZX32rr16); break; + case X86::MOVZX64rm16: LowerSubReg32_Op0(OutMI, X86::MOVZX32rm16); break; + case X86::SETB_C8r: LowerUnaryToTwoAddr(OutMI, X86::SBB8rr); break; + case X86::SETB_C16r: LowerUnaryToTwoAddr(OutMI, X86::SBB16rr); break; + case X86::SETB_C32r: LowerUnaryToTwoAddr(OutMI, X86::SBB32rr); break; + case X86::SETB_C64r: LowerUnaryToTwoAddr(OutMI, X86::SBB64rr); break; + case X86::MOV8r0: LowerUnaryToTwoAddr(OutMI, X86::XOR8rr); break; + case X86::MOV32r0: LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); break; + case X86::FsFLD0SS: LowerUnaryToTwoAddr(OutMI, X86::PXORrr); break; + case X86::FsFLD0SD: LowerUnaryToTwoAddr(OutMI, X86::PXORrr); break; + case X86::VFsFLD0SS: LowerUnaryToTwoAddr(OutMI, X86::VPXORrr); break; + case X86::VFsFLD0SD: LowerUnaryToTwoAddr(OutMI, X86::VPXORrr); break; + case X86::V_SETALLONES: LowerUnaryToTwoAddr(OutMI, X86::PCMPEQDrr); break; + case X86::AVX_SET0PSY: LowerUnaryToTwoAddr(OutMI, X86::VXORPSYrr); break; + case X86::AVX_SET0PDY: LowerUnaryToTwoAddr(OutMI, X86::VXORPDYrr); break; + case X86::AVX_SETALLONES: LowerUnaryToTwoAddr(OutMI, X86::VPCMPEQDrr); break; + + case X86::MOV16r0: + LowerSubReg32_Op0(OutMI, X86::MOV32r0); // MOV16r0 -> MOV32r0 + LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); // MOV32r0 -> XOR32rr + break; + case X86::MOV64r0: + LowerSubReg32_Op0(OutMI, X86::MOV32r0); // MOV64r0 -> MOV32r0 + LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); // MOV32r0 -> XOR32rr + break; + + // TAILJMPr64, [WIN]CALL64r, [WIN]CALL64pcrel32 - These instructions have + // register inputs modeled as normal uses instead of implicit uses. As such, + // truncate off all but the first operand (the callee). FIXME: Change isel. + case X86::TAILJMPr64: + case X86::CALL64r: + case X86::CALL64pcrel32: + case X86::WINCALL64r: + case X86::WINCALL64pcrel32: { + unsigned Opcode = OutMI.getOpcode(); + MCOperand Saved = OutMI.getOperand(0); + OutMI = MCInst(); + OutMI.setOpcode(Opcode); + OutMI.addOperand(Saved); + break; + } + + case X86::EH_RETURN: + case X86::EH_RETURN64: { + OutMI = MCInst(); + OutMI.setOpcode(X86::RET); + break; + } + + // TAILJMPd, TAILJMPd64 - Lower to the correct jump instructions. + case X86::TAILJMPr: + case X86::TAILJMPd: + case X86::TAILJMPd64: { + unsigned Opcode; + switch (OutMI.getOpcode()) { + default: assert(0 && "Invalid opcode"); + case X86::TAILJMPr: Opcode = X86::JMP32r; break; + case X86::TAILJMPd: + case X86::TAILJMPd64: Opcode = X86::JMP_1; break; + } + + MCOperand Saved = OutMI.getOperand(0); + OutMI = MCInst(); + OutMI.setOpcode(Opcode); + OutMI.addOperand(Saved); + break; + } + + // These are pseudo-ops for OR to help with the OR->ADD transformation. We do + // this with an ugly goto in case the resultant OR uses EAX and needs the + // short form. 
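+    // For example (illustrative): ADD32rr_DB is rewritten to OR32rr and then
+    // re-runs this switch via the goto; an immediate form such as ADD32ri_DB
+    // becomes OR32ri and may then be shrunk to the short OR32i32 encoding by
+    // SimplifyShortImmForm when the destination is EAX.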
+ case X86::ADD16rr_DB: OutMI.setOpcode(X86::OR16rr); goto ReSimplify; + case X86::ADD32rr_DB: OutMI.setOpcode(X86::OR32rr); goto ReSimplify; + case X86::ADD64rr_DB: OutMI.setOpcode(X86::OR64rr); goto ReSimplify; + case X86::ADD16ri_DB: OutMI.setOpcode(X86::OR16ri); goto ReSimplify; + case X86::ADD32ri_DB: OutMI.setOpcode(X86::OR32ri); goto ReSimplify; + case X86::ADD64ri32_DB: OutMI.setOpcode(X86::OR64ri32); goto ReSimplify; + case X86::ADD16ri8_DB: OutMI.setOpcode(X86::OR16ri8); goto ReSimplify; + case X86::ADD32ri8_DB: OutMI.setOpcode(X86::OR32ri8); goto ReSimplify; + case X86::ADD64ri8_DB: OutMI.setOpcode(X86::OR64ri8); goto ReSimplify; + + // The assembler backend wants to see branches in their small form and relax + // them to their large form. The JIT can only handle the large form because + // it does not do relaxation. For now, translate the large form to the + // small one here. + case X86::JMP_4: OutMI.setOpcode(X86::JMP_1); break; + case X86::JO_4: OutMI.setOpcode(X86::JO_1); break; + case X86::JNO_4: OutMI.setOpcode(X86::JNO_1); break; + case X86::JB_4: OutMI.setOpcode(X86::JB_1); break; + case X86::JAE_4: OutMI.setOpcode(X86::JAE_1); break; + case X86::JE_4: OutMI.setOpcode(X86::JE_1); break; + case X86::JNE_4: OutMI.setOpcode(X86::JNE_1); break; + case X86::JBE_4: OutMI.setOpcode(X86::JBE_1); break; + case X86::JA_4: OutMI.setOpcode(X86::JA_1); break; + case X86::JS_4: OutMI.setOpcode(X86::JS_1); break; + case X86::JNS_4: OutMI.setOpcode(X86::JNS_1); break; + case X86::JP_4: OutMI.setOpcode(X86::JP_1); break; + case X86::JNP_4: OutMI.setOpcode(X86::JNP_1); break; + case X86::JL_4: OutMI.setOpcode(X86::JL_1); break; + case X86::JGE_4: OutMI.setOpcode(X86::JGE_1); break; + case X86::JLE_4: OutMI.setOpcode(X86::JLE_1); break; + case X86::JG_4: OutMI.setOpcode(X86::JG_1); break; + + // Atomic load and store require a separate pseudo-inst because Acquire + // implies mayStore and Release implies mayLoad; fix these to regular MOV + // instructions here + case X86::ACQUIRE_MOV8rm: OutMI.setOpcode(X86::MOV8rm); goto ReSimplify; + case X86::ACQUIRE_MOV16rm: OutMI.setOpcode(X86::MOV16rm); goto ReSimplify; + case X86::ACQUIRE_MOV32rm: OutMI.setOpcode(X86::MOV32rm); goto ReSimplify; + case X86::ACQUIRE_MOV64rm: OutMI.setOpcode(X86::MOV64rm); goto ReSimplify; + case X86::RELEASE_MOV8mr: OutMI.setOpcode(X86::MOV8mr); goto ReSimplify; + case X86::RELEASE_MOV16mr: OutMI.setOpcode(X86::MOV16mr); goto ReSimplify; + case X86::RELEASE_MOV32mr: OutMI.setOpcode(X86::MOV32mr); goto ReSimplify; + case X86::RELEASE_MOV64mr: OutMI.setOpcode(X86::MOV64mr); goto ReSimplify; + + // We don't currently select the correct instruction form for instructions + // which have a short %eax, etc. form. Handle this by custom lowering, for + // now. 
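+  // For example (illustrative): a store of EAX to an absolute 32-bit address,
+  // selected as MOV32mr, can be re-encoded with the shorter MOV32ao32 form;
+  // the MOV32mr case below dispatches to SimplifyShortMoveForm, which does the
+  // rewrite only in 32-bit mode, where it actually saves space.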
+ // + // Note, we are currently not handling the following instructions: + // MOV64ao8, MOV64o8a + // XCHG16ar, XCHG32ar, XCHG64ar + case X86::MOV8mr_NOREX: + case X86::MOV8mr: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV8ao8); break; + case X86::MOV8rm_NOREX: + case X86::MOV8rm: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV8o8a); break; + case X86::MOV16mr: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV16ao16); break; + case X86::MOV16rm: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV16o16a); break; + case X86::MOV32mr: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV32ao32); break; + case X86::MOV32rm: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV32o32a); break; + + case X86::ADC8ri: SimplifyShortImmForm(OutMI, X86::ADC8i8); break; + case X86::ADC16ri: SimplifyShortImmForm(OutMI, X86::ADC16i16); break; + case X86::ADC32ri: SimplifyShortImmForm(OutMI, X86::ADC32i32); break; + case X86::ADC64ri32: SimplifyShortImmForm(OutMI, X86::ADC64i32); break; + case X86::ADD8ri: SimplifyShortImmForm(OutMI, X86::ADD8i8); break; + case X86::ADD16ri: SimplifyShortImmForm(OutMI, X86::ADD16i16); break; + case X86::ADD32ri: SimplifyShortImmForm(OutMI, X86::ADD32i32); break; + case X86::ADD64ri32: SimplifyShortImmForm(OutMI, X86::ADD64i32); break; + case X86::AND8ri: SimplifyShortImmForm(OutMI, X86::AND8i8); break; + case X86::AND16ri: SimplifyShortImmForm(OutMI, X86::AND16i16); break; + case X86::AND32ri: SimplifyShortImmForm(OutMI, X86::AND32i32); break; + case X86::AND64ri32: SimplifyShortImmForm(OutMI, X86::AND64i32); break; + case X86::CMP8ri: SimplifyShortImmForm(OutMI, X86::CMP8i8); break; + case X86::CMP16ri: SimplifyShortImmForm(OutMI, X86::CMP16i16); break; + case X86::CMP32ri: SimplifyShortImmForm(OutMI, X86::CMP32i32); break; + case X86::CMP64ri32: SimplifyShortImmForm(OutMI, X86::CMP64i32); break; + case X86::OR8ri: SimplifyShortImmForm(OutMI, X86::OR8i8); break; + case X86::OR16ri: SimplifyShortImmForm(OutMI, X86::OR16i16); break; + case X86::OR32ri: SimplifyShortImmForm(OutMI, X86::OR32i32); break; + case X86::OR64ri32: SimplifyShortImmForm(OutMI, X86::OR64i32); break; + case X86::SBB8ri: SimplifyShortImmForm(OutMI, X86::SBB8i8); break; + case X86::SBB16ri: SimplifyShortImmForm(OutMI, X86::SBB16i16); break; + case X86::SBB32ri: SimplifyShortImmForm(OutMI, X86::SBB32i32); break; + case X86::SBB64ri32: SimplifyShortImmForm(OutMI, X86::SBB64i32); break; + case X86::SUB8ri: SimplifyShortImmForm(OutMI, X86::SUB8i8); break; + case X86::SUB16ri: SimplifyShortImmForm(OutMI, X86::SUB16i16); break; + case X86::SUB32ri: SimplifyShortImmForm(OutMI, X86::SUB32i32); break; + case X86::SUB64ri32: SimplifyShortImmForm(OutMI, X86::SUB64i32); break; + case X86::TEST8ri: SimplifyShortImmForm(OutMI, X86::TEST8i8); break; + case X86::TEST16ri: SimplifyShortImmForm(OutMI, X86::TEST16i16); break; + case X86::TEST32ri: SimplifyShortImmForm(OutMI, X86::TEST32i32); break; + case X86::TEST64ri32: SimplifyShortImmForm(OutMI, X86::TEST64i32); break; + case X86::XOR8ri: SimplifyShortImmForm(OutMI, X86::XOR8i8); break; + case X86::XOR16ri: SimplifyShortImmForm(OutMI, X86::XOR16i16); break; + case X86::XOR32ri: SimplifyShortImmForm(OutMI, X86::XOR32i32); break; + case X86::XOR64ri32: SimplifyShortImmForm(OutMI, X86::XOR64i32); break; + } +} + +static void LowerTlsAddr(MCStreamer &OutStreamer, + X86MCInstLower &MCInstLowering, + const MachineInstr &MI) { + bool is64Bits = MI.getOpcode() == X86::TLS_addr64; + MCContext &context = OutStreamer.getContext(); + + if (is64Bits) { + MCInst prefix; + 
prefix.setOpcode(X86::DATA16_PREFIX); + OutStreamer.EmitInstruction(prefix); + } + MCSymbol *sym = MCInstLowering.GetSymbolFromOperand(MI.getOperand(3)); + const MCSymbolRefExpr *symRef = + MCSymbolRefExpr::Create(sym, MCSymbolRefExpr::VK_TLSGD, context); + + MCInst LEA; + if (is64Bits) { + LEA.setOpcode(X86::LEA64r); + LEA.addOperand(MCOperand::CreateReg(X86::RDI)); // dest + LEA.addOperand(MCOperand::CreateReg(X86::RIP)); // base + LEA.addOperand(MCOperand::CreateImm(1)); // scale + LEA.addOperand(MCOperand::CreateReg(0)); // index + LEA.addOperand(MCOperand::CreateExpr(symRef)); // disp + LEA.addOperand(MCOperand::CreateReg(0)); // seg + } else { + LEA.setOpcode(X86::LEA32r); + LEA.addOperand(MCOperand::CreateReg(X86::EAX)); // dest + LEA.addOperand(MCOperand::CreateReg(0)); // base + LEA.addOperand(MCOperand::CreateImm(1)); // scale + LEA.addOperand(MCOperand::CreateReg(X86::EBX)); // index + LEA.addOperand(MCOperand::CreateExpr(symRef)); // disp + LEA.addOperand(MCOperand::CreateReg(0)); // seg + } + OutStreamer.EmitInstruction(LEA); + + if (is64Bits) { + MCInst prefix; + prefix.setOpcode(X86::DATA16_PREFIX); + OutStreamer.EmitInstruction(prefix); + prefix.setOpcode(X86::DATA16_PREFIX); + OutStreamer.EmitInstruction(prefix); + prefix.setOpcode(X86::REX64_PREFIX); + OutStreamer.EmitInstruction(prefix); + } + + MCInst call; + if (is64Bits) + call.setOpcode(X86::CALL64pcrel32); + else + call.setOpcode(X86::CALLpcrel32); + StringRef name = is64Bits ? "__tls_get_addr" : "___tls_get_addr"; + MCSymbol *tlsGetAddr = context.GetOrCreateSymbol(name); + const MCSymbolRefExpr *tlsRef = + MCSymbolRefExpr::Create(tlsGetAddr, + MCSymbolRefExpr::VK_PLT, + context); + + call.addOperand(MCOperand::CreateExpr(tlsRef)); + OutStreamer.EmitInstruction(call); +} + +void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { + OutStreamer.EmitCodeRegion(); + + X86MCInstLower MCInstLowering(Mang, *MF, *this); + switch (MI->getOpcode()) { + case TargetOpcode::DBG_VALUE: + if (isVerbose() && OutStreamer.hasRawTextSupport()) { + std::string TmpStr; + raw_string_ostream OS(TmpStr); + PrintDebugValueComment(MI, OS); + OutStreamer.EmitRawText(StringRef(OS.str())); + } + return; + + // Emit nothing here but a comment if we can. + case X86::Int_MemBarrier: + if (OutStreamer.hasRawTextSupport()) + OutStreamer.EmitRawText(StringRef("\t#MEMBARRIER")); + return; + + + case X86::EH_RETURN: + case X86::EH_RETURN64: { + // Lower these as normal, but add some comments. + unsigned Reg = MI->getOperand(0).getReg(); + OutStreamer.AddComment(StringRef("eh_return, addr: %") + + X86ATTInstPrinter::getRegisterName(Reg)); + break; + } + case X86::TAILJMPr: + case X86::TAILJMPd: + case X86::TAILJMPd64: + // Lower these as normal, but add some comments. + OutStreamer.AddComment("TAILCALL"); + break; + + case X86::TLS_addr32: + case X86::TLS_addr64: + return LowerTlsAddr(OutStreamer, MCInstLowering, *MI); + + case X86::MOVPC32r: { + MCInst TmpInst; + // This is a pseudo op for a two instruction sequence with a label, which + // looks like: + // call "L1$pb" + // "L1$pb": + // popl %esi + + // Emit the call. + MCSymbol *PICBase = MF->getPICBaseSymbol(); + TmpInst.setOpcode(X86::CALLpcrel32); + // FIXME: We would like an efficient form for this, so we don't have to do a + // lot of extra uniquing. + TmpInst.addOperand(MCOperand::CreateExpr(MCSymbolRefExpr::Create(PICBase, + OutContext))); + OutStreamer.EmitInstruction(TmpInst); + + // Emit the label. 
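+    // (After the popl below, the destination register holds the address of
+    // "L1$pb"; that value is the PIC base later consumed by, e.g., the
+    // MO_GOT_ABSOLUTE_ADDRESS lowering of ADD32ri further down.)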
+ OutStreamer.EmitLabel(PICBase); + + // popl $reg + TmpInst.setOpcode(X86::POP32r); + TmpInst.getOperand(0) = MCOperand::CreateReg(MI->getOperand(0).getReg()); + OutStreamer.EmitInstruction(TmpInst); + return; + } + + case X86::ADD32ri: { + // Lower the MO_GOT_ABSOLUTE_ADDRESS form of ADD32ri. + if (MI->getOperand(2).getTargetFlags() != X86II::MO_GOT_ABSOLUTE_ADDRESS) + break; + + // Okay, we have something like: + // EAX = ADD32ri EAX, MO_GOT_ABSOLUTE_ADDRESS(@MYGLOBAL) + + // For this, we want to print something like: + // MYGLOBAL + (. - PICBASE) + // However, we can't generate a ".", so just emit a new label here and refer + // to it. + MCSymbol *DotSym = OutContext.CreateTempSymbol(); + OutStreamer.EmitLabel(DotSym); + + // Now that we have emitted the label, lower the complex operand expression. + MCSymbol *OpSym = MCInstLowering.GetSymbolFromOperand(MI->getOperand(2)); + + const MCExpr *DotExpr = MCSymbolRefExpr::Create(DotSym, OutContext); + const MCExpr *PICBase = + MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), OutContext); + DotExpr = MCBinaryExpr::CreateSub(DotExpr, PICBase, OutContext); + + DotExpr = MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(OpSym,OutContext), + DotExpr, OutContext); + + MCInst TmpInst; + TmpInst.setOpcode(X86::ADD32ri); + TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg())); + TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(1).getReg())); + TmpInst.addOperand(MCOperand::CreateExpr(DotExpr)); + OutStreamer.EmitInstruction(TmpInst); + return; + } + } + + MCInst TmpInst; + MCInstLowering.Lower(MI, TmpInst); + OutStreamer.EmitInstruction(TmpInst); +} + diff --git a/contrib/llvm/lib/Target/X86/X86MCInstLower.h b/contrib/llvm/lib/Target/X86/X86MCInstLower.h new file mode 100644 index 0000000..0210072 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86MCInstLower.h @@ -0,0 +1,52 @@ +//===-- X86MCInstLower.h - Lower MachineInstr to MCInst -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef X86_MCINSTLOWER_H +#define X86_MCINSTLOWER_H + +#include "llvm/Support/Compiler.h" + +namespace llvm { + class MCAsmInfo; + class MCContext; + class MCInst; + class MCOperand; + class MCSymbol; + class MachineInstr; + class MachineFunction; + class MachineModuleInfoMachO; + class MachineOperand; + class Mangler; + class TargetMachine; + class X86AsmPrinter; + +/// X86MCInstLower - This class is used to lower an MachineInstr into an MCInst. 
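+/// A minimal usage sketch from an asm printer (illustrative; MCIL and Inst
+/// are placeholder names):
+///   X86MCInstLower MCIL(Mang, *MF, *this);
+///   MCInst Inst;
+///   MCIL.Lower(MI, Inst);
+///   OutStreamer.EmitInstruction(Inst);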
+class LLVM_LIBRARY_VISIBILITY X86MCInstLower { + MCContext &Ctx; + Mangler *Mang; + const MachineFunction &MF; + const TargetMachine &TM; + const MCAsmInfo &MAI; + X86AsmPrinter &AsmPrinter; +public: + X86MCInstLower(Mangler *mang, const MachineFunction &MF, + X86AsmPrinter &asmprinter); + + void Lower(const MachineInstr *MI, MCInst &OutMI) const; + + MCSymbol *GetSymbolFromOperand(const MachineOperand &MO) const; + MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const; + +private: + MachineModuleInfoMachO &getMachOMMI() const; +}; + +} + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h new file mode 100644 index 0000000..b0bb313 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h @@ -0,0 +1,135 @@ +//====- X86MachineFuctionInfo.h - X86 machine function info -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares X86-specific per-machine-function information. +// +//===----------------------------------------------------------------------===// + +#ifndef X86MACHINEFUNCTIONINFO_H +#define X86MACHINEFUNCTIONINFO_H + +#include "llvm/CodeGen/MachineFunction.h" + +namespace llvm { + +/// X86MachineFunctionInfo - This class is derived from MachineFunction and +/// contains private X86 target-specific information for each MachineFunction. +class X86MachineFunctionInfo : public MachineFunctionInfo { + /// ForceFramePointer - True if the function is required to use of frame + /// pointer for reasons other than it containing dynamic allocation or + /// that FP eliminatation is turned off. For example, Cygwin main function + /// contains stack pointer re-alignment code which requires FP. + bool ForceFramePointer; + + /// CalleeSavedFrameSize - Size of the callee-saved register portion of the + /// stack frame in bytes. + unsigned CalleeSavedFrameSize; + + /// BytesToPopOnReturn - Number of bytes function pops on return (in addition + /// to the space used by the return address). + /// Used on windows platform for stdcall & fastcall name decoration + unsigned BytesToPopOnReturn; + + /// ReturnAddrIndex - FrameIndex for return slot. + int ReturnAddrIndex; + + /// TailCallReturnAddrDelta - The number of bytes by which return address + /// stack slot is moved as the result of tail call optimization. + int TailCallReturnAddrDelta; + + /// SRetReturnReg - Some subtargets require that sret lowering includes + /// returning the value of the returned struct in a register. This field + /// holds the virtual register into which the sret argument is passed. + unsigned SRetReturnReg; + + /// GlobalBaseReg - keeps track of the virtual register initialized for + /// use as the global base register. This is used for PIC in some PIC + /// relocation models. + unsigned GlobalBaseReg; + + /// VarArgsFrameIndex - FrameIndex for start of varargs area. + int VarArgsFrameIndex; + /// RegSaveFrameIndex - X86-64 vararg func register save area. + int RegSaveFrameIndex; + /// VarArgsGPOffset - X86-64 vararg func int reg offset. + unsigned VarArgsGPOffset; + /// VarArgsFPOffset - X86-64 vararg func fp reg offset. + unsigned VarArgsFPOffset; + /// ArgumentStackSize - The number of bytes on stack consumed by the arguments + /// being passed on the stack. 
+ unsigned ArgumentStackSize; + +public: + X86MachineFunctionInfo() : ForceFramePointer(false), + CalleeSavedFrameSize(0), + BytesToPopOnReturn(0), + ReturnAddrIndex(0), + TailCallReturnAddrDelta(0), + SRetReturnReg(0), + GlobalBaseReg(0), + VarArgsFrameIndex(0), + RegSaveFrameIndex(0), + VarArgsGPOffset(0), + VarArgsFPOffset(0), + ArgumentStackSize(0) {} + + explicit X86MachineFunctionInfo(MachineFunction &MF) + : ForceFramePointer(false), + CalleeSavedFrameSize(0), + BytesToPopOnReturn(0), + ReturnAddrIndex(0), + TailCallReturnAddrDelta(0), + SRetReturnReg(0), + GlobalBaseReg(0), + VarArgsFrameIndex(0), + RegSaveFrameIndex(0), + VarArgsGPOffset(0), + VarArgsFPOffset(0), + ArgumentStackSize(0) {} + + bool getForceFramePointer() const { return ForceFramePointer;} + void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; } + + unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; } + void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; } + + unsigned getBytesToPopOnReturn() const { return BytesToPopOnReturn; } + void setBytesToPopOnReturn (unsigned bytes) { BytesToPopOnReturn = bytes;} + + int getRAIndex() const { return ReturnAddrIndex; } + void setRAIndex(int Index) { ReturnAddrIndex = Index; } + + int getTCReturnAddrDelta() const { return TailCallReturnAddrDelta; } + void setTCReturnAddrDelta(int delta) {TailCallReturnAddrDelta = delta;} + + unsigned getSRetReturnReg() const { return SRetReturnReg; } + void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; } + + unsigned getGlobalBaseReg() const { return GlobalBaseReg; } + void setGlobalBaseReg(unsigned Reg) { GlobalBaseReg = Reg; } + + int getVarArgsFrameIndex() const { return VarArgsFrameIndex; } + void setVarArgsFrameIndex(int Idx) { VarArgsFrameIndex = Idx; } + + int getRegSaveFrameIndex() const { return RegSaveFrameIndex; } + void setRegSaveFrameIndex(int Idx) { RegSaveFrameIndex = Idx; } + + unsigned getVarArgsGPOffset() const { return VarArgsGPOffset; } + void setVarArgsGPOffset(unsigned Offset) { VarArgsGPOffset = Offset; } + + unsigned getVarArgsFPOffset() const { return VarArgsFPOffset; } + void setVarArgsFPOffset(unsigned Offset) { VarArgsFPOffset = Offset; } + + unsigned getArgumentStackSize() const { return ArgumentStackSize; } + void setArgumentStackSize(unsigned size) { ArgumentStackSize = size; } +}; + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp new file mode 100644 index 0000000..c1ac9f3 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -0,0 +1,870 @@ +//===- X86RegisterInfo.cpp - X86 Register Information -----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the X86 implementation of the TargetRegisterInfo class. +// This file is responsible for the frame pointer elimination optimization +// on X86. 
+// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86RegisterInfo.h" +#include "X86InstrBuilder.h" +#include "X86MachineFunctionInfo.h" +#include "X86Subtarget.h" +#include "X86TargetMachine.h" +#include "llvm/Constants.h" +#include "llvm/Function.h" +#include "llvm/Type.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/CommandLine.h" + +#define GET_REGINFO_TARGET_DESC +#include "X86GenRegisterInfo.inc" + +using namespace llvm; + +cl::opt<bool> +ForceStackAlign("force-align-stack", + cl::desc("Force align the stack to the minimum alignment" + " needed for the function."), + cl::init(false), cl::Hidden); + +X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm, + const TargetInstrInfo &tii) + : X86GenRegisterInfo(tm.getSubtarget<X86Subtarget>().is64Bit() + ? X86::RIP : X86::EIP, + X86_MC::getDwarfRegFlavour(tm.getTargetTriple(), false), + X86_MC::getDwarfRegFlavour(tm.getTargetTriple(), true)), + TM(tm), TII(tii) { + X86_MC::InitLLVM2SEHRegisterMapping(this); + + // Cache some information. + const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>(); + Is64Bit = Subtarget->is64Bit(); + IsWin64 = Subtarget->isTargetWin64(); + + if (Is64Bit) { + SlotSize = 8; + StackPtr = X86::RSP; + FramePtr = X86::RBP; + } else { + SlotSize = 4; + StackPtr = X86::ESP; + FramePtr = X86::EBP; + } +} + +/// getCompactUnwindRegNum - This function maps the register to the number for +/// compact unwind encoding. Return -1 if the register isn't valid. 
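+/// The mapping below follows the compact-unwind convention of giving each
+/// callee-saved GPR a small index (EBX/RBX = 1 ... EBP/RBP = 6); -1 simply
+/// means the register has no compact encoding.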
+int X86RegisterInfo::getCompactUnwindRegNum(unsigned RegNum, bool isEH) const { + switch (getLLVMRegNum(RegNum, isEH)) { + case X86::EBX: case X86::RBX: return 1; + case X86::ECX: case X86::R12: return 2; + case X86::EDX: case X86::R13: return 3; + case X86::EDI: case X86::R14: return 4; + case X86::ESI: case X86::R15: return 5; + case X86::EBP: case X86::RBP: return 6; + } + + return -1; +} + +int +X86RegisterInfo::getSEHRegNum(unsigned i) const { + int reg = X86_MC::getX86RegNum(i); + switch (i) { + case X86::R8: case X86::R8D: case X86::R8W: case X86::R8B: + case X86::R9: case X86::R9D: case X86::R9W: case X86::R9B: + case X86::R10: case X86::R10D: case X86::R10W: case X86::R10B: + case X86::R11: case X86::R11D: case X86::R11W: case X86::R11B: + case X86::R12: case X86::R12D: case X86::R12W: case X86::R12B: + case X86::R13: case X86::R13D: case X86::R13W: case X86::R13B: + case X86::R14: case X86::R14D: case X86::R14W: case X86::R14B: + case X86::R15: case X86::R15D: case X86::R15W: case X86::R15B: + case X86::XMM8: case X86::XMM9: case X86::XMM10: case X86::XMM11: + case X86::XMM12: case X86::XMM13: case X86::XMM14: case X86::XMM15: + case X86::YMM8: case X86::YMM9: case X86::YMM10: case X86::YMM11: + case X86::YMM12: case X86::YMM13: case X86::YMM14: case X86::YMM15: + reg += 8; + } + return reg; +} + +const TargetRegisterClass * +X86RegisterInfo::getSubClassWithSubReg(const TargetRegisterClass *RC, + unsigned Idx) const { + // The sub_8bit sub-register index is more constrained in 32-bit mode. + // It behaves just like the sub_8bit_hi index. + if (!Is64Bit && Idx == X86::sub_8bit) + Idx = X86::sub_8bit_hi; + + // Forward to TableGen's default version. + return X86GenRegisterInfo::getSubClassWithSubReg(RC, Idx); +} + +const TargetRegisterClass * +X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A, + const TargetRegisterClass *B, + unsigned SubIdx) const { + switch (SubIdx) { + default: return 0; + case X86::sub_8bit: + if (B == &X86::GR8RegClass) { + if (A->getSize() == 2 || A->getSize() == 4 || A->getSize() == 8) + return A; + } else if (B == &X86::GR8_ABCD_LRegClass || B == &X86::GR8_ABCD_HRegClass) { + if (A == &X86::GR64RegClass || A == &X86::GR64_ABCDRegClass || + A == &X86::GR64_NOREXRegClass || + A == &X86::GR64_NOSPRegClass || + A == &X86::GR64_NOREX_NOSPRegClass) + return &X86::GR64_ABCDRegClass; + else if (A == &X86::GR32RegClass || A == &X86::GR32_ABCDRegClass || + A == &X86::GR32_NOREXRegClass || + A == &X86::GR32_NOSPRegClass) + return &X86::GR32_ABCDRegClass; + else if (A == &X86::GR16RegClass || A == &X86::GR16_ABCDRegClass || + A == &X86::GR16_NOREXRegClass) + return &X86::GR16_ABCDRegClass; + } else if (B == &X86::GR8_NOREXRegClass) { + if (A == &X86::GR64RegClass || A == &X86::GR64_NOREXRegClass || + A == &X86::GR64_NOSPRegClass || A == &X86::GR64_NOREX_NOSPRegClass) + return &X86::GR64_NOREXRegClass; + else if (A == &X86::GR64_ABCDRegClass) + return &X86::GR64_ABCDRegClass; + else if (A == &X86::GR32RegClass || A == &X86::GR32_NOREXRegClass || + A == &X86::GR32_NOSPRegClass) + return &X86::GR32_NOREXRegClass; + else if (A == &X86::GR32_ABCDRegClass) + return &X86::GR32_ABCDRegClass; + else if (A == &X86::GR16RegClass || A == &X86::GR16_NOREXRegClass) + return &X86::GR16_NOREXRegClass; + else if (A == &X86::GR16_ABCDRegClass) + return &X86::GR16_ABCDRegClass; + } + break; + case X86::sub_8bit_hi: + if (B->hasSubClassEq(&X86::GR8_ABCD_HRegClass)) + switch (A->getSize()) { + case 2: return getCommonSubClass(A, &X86::GR16_ABCDRegClass); + case 4: 
return getCommonSubClass(A, &X86::GR32_ABCDRegClass); + case 8: return getCommonSubClass(A, &X86::GR64_ABCDRegClass); + default: return 0; + } + break; + case X86::sub_16bit: + if (B == &X86::GR16RegClass) { + if (A->getSize() == 4 || A->getSize() == 8) + return A; + } else if (B == &X86::GR16_ABCDRegClass) { + if (A == &X86::GR64RegClass || A == &X86::GR64_ABCDRegClass || + A == &X86::GR64_NOREXRegClass || + A == &X86::GR64_NOSPRegClass || + A == &X86::GR64_NOREX_NOSPRegClass) + return &X86::GR64_ABCDRegClass; + else if (A == &X86::GR32RegClass || A == &X86::GR32_ABCDRegClass || + A == &X86::GR32_NOREXRegClass || A == &X86::GR32_NOSPRegClass) + return &X86::GR32_ABCDRegClass; + } else if (B == &X86::GR16_NOREXRegClass) { + if (A == &X86::GR64RegClass || A == &X86::GR64_NOREXRegClass || + A == &X86::GR64_NOSPRegClass || A == &X86::GR64_NOREX_NOSPRegClass) + return &X86::GR64_NOREXRegClass; + else if (A == &X86::GR64_ABCDRegClass) + return &X86::GR64_ABCDRegClass; + else if (A == &X86::GR32RegClass || A == &X86::GR32_NOREXRegClass || + A == &X86::GR32_NOSPRegClass) + return &X86::GR32_NOREXRegClass; + else if (A == &X86::GR32_ABCDRegClass) + return &X86::GR64_ABCDRegClass; + } + break; + case X86::sub_32bit: + if (B == &X86::GR32RegClass) { + if (A->getSize() == 8) + return A; + } else if (B == &X86::GR32_NOSPRegClass) { + if (A == &X86::GR64RegClass || A == &X86::GR64_NOSPRegClass) + return &X86::GR64_NOSPRegClass; + if (A->getSize() == 8) + return getCommonSubClass(A, &X86::GR64_NOSPRegClass); + } else if (B == &X86::GR32_ABCDRegClass) { + if (A == &X86::GR64RegClass || A == &X86::GR64_ABCDRegClass || + A == &X86::GR64_NOREXRegClass || + A == &X86::GR64_NOSPRegClass || + A == &X86::GR64_NOREX_NOSPRegClass) + return &X86::GR64_ABCDRegClass; + } else if (B == &X86::GR32_NOREXRegClass) { + if (A == &X86::GR64RegClass || A == &X86::GR64_NOREXRegClass) + return &X86::GR64_NOREXRegClass; + else if (A == &X86::GR64_NOSPRegClass || A == &X86::GR64_NOREX_NOSPRegClass) + return &X86::GR64_NOREX_NOSPRegClass; + else if (A == &X86::GR64_ABCDRegClass) + return &X86::GR64_ABCDRegClass; + } else if (B == &X86::GR32_NOREX_NOSPRegClass) { + if (A == &X86::GR64RegClass || A == &X86::GR64_NOREXRegClass || + A == &X86::GR64_NOSPRegClass || A == &X86::GR64_NOREX_NOSPRegClass) + return &X86::GR64_NOREX_NOSPRegClass; + else if (A == &X86::GR64_ABCDRegClass) + return &X86::GR64_ABCDRegClass; + } + break; + case X86::sub_ss: + if (B == &X86::FR32RegClass) + return A; + break; + case X86::sub_sd: + if (B == &X86::FR64RegClass) + return A; + break; + case X86::sub_xmm: + if (B == &X86::VR128RegClass) + return A; + break; + } + return 0; +} + +const TargetRegisterClass* +X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) const{ + // Don't allow super-classes of GR8_NOREX. This class is only used after + // extrating sub_8bit_hi sub-registers. The H sub-registers cannot be copied + // to the full GR8 register class in 64-bit mode, so we cannot allow the + // reigster class inflation. + // + // The GR8_NOREX class is always used in a way that won't be constrained to a + // sub-class, so sub-classes like GR8_ABCD_L are allowed to expand to the + // full GR8 class. 
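+  // For example (illustrative): inflating a GR32_ABCD virtual register to
+  // GR32 is harmless, but inflating GR8_NOREX to GR8 could later force a copy
+  // of AH/BH/CH/DH into a register that needs a REX prefix, which the
+  // encoding cannot express in 64-bit mode.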
+ if (RC == X86::GR8_NOREXRegisterClass) + return RC; + + const TargetRegisterClass *Super = RC; + TargetRegisterClass::sc_iterator I = RC->getSuperClasses(); + do { + switch (Super->getID()) { + case X86::GR8RegClassID: + case X86::GR16RegClassID: + case X86::GR32RegClassID: + case X86::GR64RegClassID: + case X86::FR32RegClassID: + case X86::FR64RegClassID: + case X86::RFP32RegClassID: + case X86::RFP64RegClassID: + case X86::RFP80RegClassID: + case X86::VR128RegClassID: + case X86::VR256RegClassID: + // Don't return a super-class that would shrink the spill size. + // That can happen with the vector and float classes. + if (Super->getSize() == RC->getSize()) + return Super; + } + Super = *I++; + } while (Super); + return RC; +} + +const TargetRegisterClass * +X86RegisterInfo::getPointerRegClass(unsigned Kind) const { + switch (Kind) { + default: llvm_unreachable("Unexpected Kind in getPointerRegClass!"); + case 0: // Normal GPRs. + if (TM.getSubtarget<X86Subtarget>().is64Bit()) + return &X86::GR64RegClass; + return &X86::GR32RegClass; + case 1: // Normal GPRs except the stack pointer (for encoding reasons). + if (TM.getSubtarget<X86Subtarget>().is64Bit()) + return &X86::GR64_NOSPRegClass; + return &X86::GR32_NOSPRegClass; + case 2: // Available for tailcall (not callee-saved GPRs). + if (TM.getSubtarget<X86Subtarget>().isTargetWin64()) + return &X86::GR64_TCW64RegClass; + if (TM.getSubtarget<X86Subtarget>().is64Bit()) + return &X86::GR64_TCRegClass; + return &X86::GR32_TCRegClass; + } +} + +const TargetRegisterClass * +X86RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const { + if (RC == &X86::CCRRegClass) { + if (Is64Bit) + return &X86::GR64RegClass; + else + return &X86::GR32RegClass; + } + return RC; +} + +unsigned +X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, + MachineFunction &MF) const { + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + + unsigned FPDiff = TFI->hasFP(MF) ? 1 : 0; + switch (RC->getID()) { + default: + return 0; + case X86::GR32RegClassID: + return 4 - FPDiff; + case X86::GR64RegClassID: + return 12 - FPDiff; + case X86::VR128RegClassID: + return TM.getSubtarget<X86Subtarget>().is64Bit() ? 10 : 4; + case X86::VR64RegClassID: + return 4; + } +} + +const unsigned * +X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { + bool callsEHReturn = false; + bool ghcCall = false; + + if (MF) { + callsEHReturn = MF->getMMI().callsEHReturn(); + const Function *F = MF->getFunction(); + ghcCall = (F ? 
F->getCallingConv() == CallingConv::GHC : false); + } + + static const unsigned GhcCalleeSavedRegs[] = { + 0 + }; + + static const unsigned CalleeSavedRegs32Bit[] = { + X86::ESI, X86::EDI, X86::EBX, X86::EBP, 0 + }; + + static const unsigned CalleeSavedRegs32EHRet[] = { + X86::EAX, X86::EDX, X86::ESI, X86::EDI, X86::EBX, X86::EBP, 0 + }; + + static const unsigned CalleeSavedRegs64Bit[] = { + X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0 + }; + + static const unsigned CalleeSavedRegs64EHRet[] = { + X86::RAX, X86::RDX, X86::RBX, X86::R12, + X86::R13, X86::R14, X86::R15, X86::RBP, 0 + }; + + static const unsigned CalleeSavedRegsWin64[] = { + X86::RBX, X86::RBP, X86::RDI, X86::RSI, + X86::R12, X86::R13, X86::R14, X86::R15, + X86::XMM6, X86::XMM7, X86::XMM8, X86::XMM9, + X86::XMM10, X86::XMM11, X86::XMM12, X86::XMM13, + X86::XMM14, X86::XMM15, 0 + }; + + if (ghcCall) { + return GhcCalleeSavedRegs; + } else if (Is64Bit) { + if (IsWin64) + return CalleeSavedRegsWin64; + else + return (callsEHReturn ? CalleeSavedRegs64EHRet : CalleeSavedRegs64Bit); + } else { + return (callsEHReturn ? CalleeSavedRegs32EHRet : CalleeSavedRegs32Bit); + } +} + +BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { + BitVector Reserved(getNumRegs()); + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + + // Set the stack-pointer register and its aliases as reserved. + Reserved.set(X86::RSP); + Reserved.set(X86::ESP); + Reserved.set(X86::SP); + Reserved.set(X86::SPL); + + // Set the instruction pointer register and its aliases as reserved. + Reserved.set(X86::RIP); + Reserved.set(X86::EIP); + Reserved.set(X86::IP); + + // Set the frame-pointer register and its aliases as reserved if needed. + if (TFI->hasFP(MF)) { + Reserved.set(X86::RBP); + Reserved.set(X86::EBP); + Reserved.set(X86::BP); + Reserved.set(X86::BPL); + } + + // Mark the segment registers as reserved. + Reserved.set(X86::CS); + Reserved.set(X86::SS); + Reserved.set(X86::DS); + Reserved.set(X86::ES); + Reserved.set(X86::FS); + Reserved.set(X86::GS); + + // Reserve the registers that only exist in 64-bit mode. + if (!Is64Bit) { + // These 8-bit registers are part of the x86-64 extension even though their + // super-registers are old 32-bits. + Reserved.set(X86::SIL); + Reserved.set(X86::DIL); + Reserved.set(X86::BPL); + Reserved.set(X86::SPL); + + for (unsigned n = 0; n != 8; ++n) { + // R8, R9, ... + const unsigned GPR64[] = { + X86::R8, X86::R9, X86::R10, X86::R11, + X86::R12, X86::R13, X86::R14, X86::R15 + }; + for (const unsigned *AI = getOverlaps(GPR64[n]); unsigned Reg = *AI; ++AI) + Reserved.set(Reg); + + // XMM8, XMM9, ... 
+ assert(X86::XMM15 == X86::XMM8+7); + for (const unsigned *AI = getOverlaps(X86::XMM8 + n); unsigned Reg = *AI; + ++AI) + Reserved.set(Reg); + } + } + + return Reserved; +} + +//===----------------------------------------------------------------------===// +// Stack Frame Processing methods +//===----------------------------------------------------------------------===// + +bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + return (RealignStack && + !MFI->hasVarSizedObjects()); +} + +bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const Function *F = MF.getFunction(); + unsigned StackAlign = TM.getFrameLowering()->getStackAlignment(); + bool requiresRealignment = ((MFI->getMaxAlignment() > StackAlign) || + F->hasFnAttr(Attribute::StackAlignment)); + + // FIXME: Currently we don't support stack realignment for functions with + // variable-sized allocas. + // FIXME: It's more complicated than this... + if (0 && requiresRealignment && MFI->hasVarSizedObjects()) + report_fatal_error( + "Stack realignment in presence of dynamic allocas is not supported"); + + // If we've requested that we force align the stack do so now. + if (ForceStackAlign) + return canRealignStack(MF); + + return requiresRealignment && canRealignStack(MF); +} + +bool X86RegisterInfo::hasReservedSpillSlot(const MachineFunction &MF, + unsigned Reg, int &FrameIdx) const { + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + + if (Reg == FramePtr && TFI->hasFP(MF)) { + FrameIdx = MF.getFrameInfo()->getObjectIndexBegin(); + return true; + } + return false; +} + +static unsigned getSUBriOpcode(unsigned is64Bit, int64_t Imm) { + if (is64Bit) { + if (isInt<8>(Imm)) + return X86::SUB64ri8; + return X86::SUB64ri32; + } else { + if (isInt<8>(Imm)) + return X86::SUB32ri8; + return X86::SUB32ri; + } +} + +static unsigned getADDriOpcode(unsigned is64Bit, int64_t Imm) { + if (is64Bit) { + if (isInt<8>(Imm)) + return X86::ADD64ri8; + return X86::ADD64ri32; + } else { + if (isInt<8>(Imm)) + return X86::ADD32ri8; + return X86::ADD32ri; + } +} + +void X86RegisterInfo:: +eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + bool reseveCallFrame = TFI->hasReservedCallFrame(MF); + int Opcode = I->getOpcode(); + bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode(); + DebugLoc DL = I->getDebugLoc(); + uint64_t Amount = !reseveCallFrame ? I->getOperand(0).getImm() : 0; + uint64_t CalleeAmt = isDestroy ? I->getOperand(1).getImm() : 0; + I = MBB.erase(I); + + if (!reseveCallFrame) { + // If the stack pointer can be changed after prologue, turn the + // adjcallstackup instruction into a 'sub ESP, <amt>' and the + // adjcallstackdown instruction into 'add ESP, <amt>' + // TODO: consider using push / pop instead of sub + store / add + if (Amount == 0) + return; + + // We need to keep the stack aligned properly. To do this, we round the + // amount of space needed for the outgoing arguments up to the next + // alignment boundary. 
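+    // For example (illustrative): with a 16-byte stack alignment, a request
+    // for 20 bytes of outgoing-argument space is rounded up to 32:
+    //   Amount = (20 + 16 - 1) / 16 * 16 = 32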
+ unsigned StackAlign = TM.getFrameLowering()->getStackAlignment(); + Amount = (Amount + StackAlign - 1) / StackAlign * StackAlign; + + MachineInstr *New = 0; + if (Opcode == TII.getCallFrameSetupOpcode()) { + New = BuildMI(MF, DL, TII.get(getSUBriOpcode(Is64Bit, Amount)), + StackPtr) + .addReg(StackPtr) + .addImm(Amount); + } else { + assert(Opcode == TII.getCallFrameDestroyOpcode()); + + // Factor out the amount the callee already popped. + Amount -= CalleeAmt; + + if (Amount) { + unsigned Opc = getADDriOpcode(Is64Bit, Amount); + New = BuildMI(MF, DL, TII.get(Opc), StackPtr) + .addReg(StackPtr).addImm(Amount); + } + } + + if (New) { + // The EFLAGS implicit def is dead. + New->getOperand(3).setIsDead(); + + // Replace the pseudo instruction with a new instruction. + MBB.insert(I, New); + } + + return; + } + + if (Opcode == TII.getCallFrameDestroyOpcode() && CalleeAmt) { + // If we are performing frame pointer elimination and if the callee pops + // something off the stack pointer, add it back. We do this until we have + // more advanced stack pointer tracking ability. + unsigned Opc = getSUBriOpcode(Is64Bit, CalleeAmt); + MachineInstr *New = BuildMI(MF, DL, TII.get(Opc), StackPtr) + .addReg(StackPtr).addImm(CalleeAmt); + + // The EFLAGS implicit def is dead. + New->getOperand(3).setIsDead(); + + // We are not tracking the stack pointer adjustment by the callee, so make + // sure we restore the stack pointer immediately after the call, there may + // be spill code inserted between the CALL and ADJCALLSTACKUP instructions. + MachineBasicBlock::iterator B = MBB.begin(); + while (I != B && !llvm::prior(I)->getDesc().isCall()) + --I; + MBB.insert(I, New); + } +} + +void +X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, + int SPAdj, RegScavenger *RS) const{ + assert(SPAdj == 0 && "Unexpected"); + + unsigned i = 0; + MachineInstr &MI = *II; + MachineFunction &MF = *MI.getParent()->getParent(); + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + + while (!MI.getOperand(i).isFI()) { + ++i; + assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!"); + } + + int FrameIndex = MI.getOperand(i).getIndex(); + unsigned BasePtr; + + unsigned Opc = MI.getOpcode(); + bool AfterFPPop = Opc == X86::TAILJMPm64 || Opc == X86::TAILJMPm; + if (needsStackRealignment(MF)) + BasePtr = (FrameIndex < 0 ? FramePtr : StackPtr); + else if (AfterFPPop) + BasePtr = StackPtr; + else + BasePtr = (TFI->hasFP(MF) ? FramePtr : StackPtr); + + // This must be part of a four operand memory reference. Replace the + // FrameIndex with base register with EBP. Add an offset to the offset. + MI.getOperand(i).ChangeToRegister(BasePtr, false); + + // Now add the frame object offset to the offset from EBP. + int FIOffset; + if (AfterFPPop) { + // Tail call jmp happens after FP is popped. + const MachineFrameInfo *MFI = MF.getFrameInfo(); + FIOffset = MFI->getObjectOffset(FrameIndex) - TFI->getOffsetOfLocalArea(); + } else + FIOffset = TFI->getFrameIndexOffset(MF, FrameIndex); + + if (MI.getOperand(i+3).isImm()) { + // Offset is a 32-bit integer. + int Imm = (int)(MI.getOperand(i + 3).getImm()); + int Offset = FIOffset + Imm; + assert((!Is64Bit || isInt<32>((long long)FIOffset + Imm)) && + "Requesting 64-bit offset in 32-bit immediate!"); + MI.getOperand(i + 3).ChangeToImmediate(Offset); + } else { + // Offset is symbolic. This is extremely rare. 
+ uint64_t Offset = FIOffset + (uint64_t)MI.getOperand(i+3).getOffset(); + MI.getOperand(i+3).setOffset(Offset); + } +} + +unsigned X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const { + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + return TFI->hasFP(MF) ? FramePtr : StackPtr; +} + +unsigned X86RegisterInfo::getEHExceptionRegister() const { + llvm_unreachable("What is the exception register"); + return 0; +} + +unsigned X86RegisterInfo::getEHHandlerRegister() const { + llvm_unreachable("What is the exception handler register"); + return 0; +} + +namespace llvm { +unsigned getX86SubSuperRegister(unsigned Reg, EVT VT, bool High) { + switch (VT.getSimpleVT().SimpleTy) { + default: return Reg; + case MVT::i8: + if (High) { + switch (Reg) { + default: return 0; + case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: + return X86::AH; + case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: + return X86::DH; + case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: + return X86::CH; + case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: + return X86::BH; + } + } else { + switch (Reg) { + default: return 0; + case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: + return X86::AL; + case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: + return X86::DL; + case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: + return X86::CL; + case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: + return X86::BL; + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::SIL; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::DIL; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::BPL; + case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: + return X86::SPL; + case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: + return X86::R8B; + case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: + return X86::R9B; + case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: + return X86::R10B; + case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: + return X86::R11B; + case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: + return X86::R12B; + case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: + return X86::R13B; + case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: + return X86::R14B; + case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: + return X86::R15B; + } + } + case MVT::i16: + switch (Reg) { + default: return Reg; + case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: + return X86::AX; + case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: + return X86::DX; + case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: + return X86::CX; + case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: + return X86::BX; + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::SI; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::DI; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::BP; + case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: + return X86::SP; + case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: + return X86::R8W; + case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: + return X86::R9W; + case X86::R10B: case X86::R10W: case X86::R10D: case 
X86::R10: + return X86::R10W; + case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: + return X86::R11W; + case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: + return X86::R12W; + case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: + return X86::R13W; + case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: + return X86::R14W; + case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: + return X86::R15W; + } + case MVT::i32: + switch (Reg) { + default: return Reg; + case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: + return X86::EAX; + case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: + return X86::EDX; + case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: + return X86::ECX; + case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: + return X86::EBX; + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::ESI; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::EDI; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::EBP; + case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: + return X86::ESP; + case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: + return X86::R8D; + case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: + return X86::R9D; + case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: + return X86::R10D; + case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: + return X86::R11D; + case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: + return X86::R12D; + case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: + return X86::R13D; + case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: + return X86::R14D; + case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: + return X86::R15D; + } + case MVT::i64: + switch (Reg) { + default: return Reg; + case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: + return X86::RAX; + case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: + return X86::RDX; + case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: + return X86::RCX; + case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: + return X86::RBX; + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::RSI; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::RDI; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::RBP; + case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: + return X86::RSP; + case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: + return X86::R8; + case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: + return X86::R9; + case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: + return X86::R10; + case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: + return X86::R11; + case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: + return X86::R12; + case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: + return X86::R13; + case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: + return X86::R14; + case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: + return X86::R15; + } + } + + return Reg; +} +} + +namespace { + struct MSAH : public MachineFunctionPass { + static char ID; + MSAH() : MachineFunctionPass(ID) {} + + virtual bool runOnMachineFunction(MachineFunction &MF) { + const X86TargetMachine *TM = + static_cast<const 
X86TargetMachine *>(&MF.getTarget()); + const TargetFrameLowering *TFI = TM->getFrameLowering(); + MachineRegisterInfo &RI = MF.getRegInfo(); + X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); + unsigned StackAlignment = TFI->getStackAlignment(); + + // Be over-conservative: scan over all vreg defs and find whether vector + // registers are used. If yes, there is a possibility that vector register + // will be spilled and thus require dynamic stack realignment. + for (unsigned i = 0, e = RI.getNumVirtRegs(); i != e; ++i) { + unsigned Reg = TargetRegisterInfo::index2VirtReg(i); + if (RI.getRegClass(Reg)->getAlignment() > StackAlignment) { + FuncInfo->setForceFramePointer(true); + return true; + } + } + // Nothing to do + return false; + } + + virtual const char *getPassName() const { + return "X86 Maximal Stack Alignment Check"; + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + }; + + char MSAH::ID = 0; +} + +FunctionPass* +llvm::createX86MaxStackAlignmentHeuristicPass() { return new MSAH(); } diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.h b/contrib/llvm/lib/Target/X86/X86RegisterInfo.h new file mode 100644 index 0000000..7d39c68 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.h @@ -0,0 +1,138 @@ +//===- X86RegisterInfo.h - X86 Register Information Impl --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the X86 implementation of the TargetRegisterInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef X86REGISTERINFO_H +#define X86REGISTERINFO_H + +#include "llvm/Target/TargetRegisterInfo.h" + +#define GET_REGINFO_HEADER +#include "X86GenRegisterInfo.inc" + +namespace llvm { + class Type; + class TargetInstrInfo; + class X86TargetMachine; + +class X86RegisterInfo : public X86GenRegisterInfo { +public: + X86TargetMachine &TM; + const TargetInstrInfo &TII; + +private: + /// Is64Bit - Is the target 64-bits. + /// + bool Is64Bit; + + /// IsWin64 - Is the target on of win64 flavours + /// + bool IsWin64; + + /// SlotSize - Stack slot size in bytes. + /// + unsigned SlotSize; + + /// StackPtr - X86 physical register used as stack ptr. + /// + unsigned StackPtr; + + /// FramePtr - X86 physical register used as frame ptr. + /// + unsigned FramePtr; + +public: + X86RegisterInfo(X86TargetMachine &tm, const TargetInstrInfo &tii); + + /// getX86RegNum - Returns the native X86 register number for the given LLVM + /// register identifier. + static unsigned getX86RegNum(unsigned RegNo); + + // FIXME: This should be tablegen'd like getDwarfRegNum is + int getSEHRegNum(unsigned i) const; + + /// getCompactUnwindRegNum - This function maps the register to the number for + /// compact unwind encoding. Return -1 if the register isn't valid. + int getCompactUnwindRegNum(unsigned RegNum, bool isEH) const; + + /// Code Generation virtual methods... + /// + + /// getMatchingSuperRegClass - Return a subclass of the specified register + /// class A so that each register in it has a sub-register of the + /// specified sub-register index which is in the specified register class B. 
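+  /// For example, restricting GR32 to registers whose sub_8bit_hi
+  /// sub-register lies in GR8 yields GR32_ABCD (EAX, ECX, EDX, EBX).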
+ virtual const TargetRegisterClass * + getMatchingSuperRegClass(const TargetRegisterClass *A, + const TargetRegisterClass *B, unsigned Idx) const; + + virtual const TargetRegisterClass * + getSubClassWithSubReg(const TargetRegisterClass *RC, unsigned Idx) const; + + const TargetRegisterClass* + getLargestLegalSuperClass(const TargetRegisterClass *RC) const; + + /// getPointerRegClass - Returns a TargetRegisterClass used for pointer + /// values. + const TargetRegisterClass *getPointerRegClass(unsigned Kind = 0) const; + + /// getCrossCopyRegClass - Returns a legal register class to copy a register + /// in the specified class to or from. Returns NULL if it is possible to copy + /// between a two registers of the specified class. + const TargetRegisterClass * + getCrossCopyRegClass(const TargetRegisterClass *RC) const; + + unsigned getRegPressureLimit(const TargetRegisterClass *RC, + MachineFunction &MF) const; + + /// getCalleeSavedRegs - Return a null-terminated list of all of the + /// callee-save registers on this target. + const unsigned *getCalleeSavedRegs(const MachineFunction* MF = 0) const; + + /// getReservedRegs - Returns a bitset indexed by physical register number + /// indicating if a register is a special register that has particular uses and + /// should be considered unavailable at all times, e.g. SP, RA. This is used by + /// register scavenger to determine what registers are free. + BitVector getReservedRegs(const MachineFunction &MF) const; + + bool canRealignStack(const MachineFunction &MF) const; + + bool needsStackRealignment(const MachineFunction &MF) const; + + bool hasReservedSpillSlot(const MachineFunction &MF, unsigned Reg, + int &FrameIdx) const; + + void eliminateCallFramePseudoInstr(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const; + + void eliminateFrameIndex(MachineBasicBlock::iterator MI, + int SPAdj, RegScavenger *RS = NULL) const; + + // Debug information queries. + unsigned getFrameRegister(const MachineFunction &MF) const; + unsigned getStackRegister() const { return StackPtr; } + // FIXME: Move to FrameInfok + unsigned getSlotSize() const { return SlotSize; } + + // Exception handling queries. + unsigned getEHExceptionRegister() const; + unsigned getEHHandlerRegister() const; +}; + +// getX86SubSuperRegister - X86 utility function. It returns the sub or super +// register of a specific X86 register. +// e.g. getX86SubSuperRegister(X86::EAX, EVT::i16) return X86:AX +unsigned getX86SubSuperRegister(unsigned, EVT, bool High=false); + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.td b/contrib/llvm/lib/Target/X86/X86RegisterInfo.td new file mode 100644 index 0000000..9a7db36 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.td @@ -0,0 +1,474 @@ +//===- X86RegisterInfo.td - Describe the X86 Register File --*- tablegen -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the X86 Register file, defining the registers themselves, +// aliases between the registers, and the register classes built out of the +// registers. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Register definitions... 
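+// (subregister indices, general-purpose registers, x87/MMX, XMM/YMM,
+// segment, debug and control registers)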
+// +let Namespace = "X86" in { + + // Subregister indices. + def sub_8bit : SubRegIndex; + def sub_8bit_hi : SubRegIndex; + def sub_16bit : SubRegIndex; + def sub_32bit : SubRegIndex; + + def sub_ss : SubRegIndex; + def sub_sd : SubRegIndex; + def sub_xmm : SubRegIndex; + + + // In the register alias definitions below, we define which registers alias + // which others. We only specify which registers the small registers alias, + // because the register file generator is smart enough to figure out that + // AL aliases AX if we tell it that AX aliased AL (for example). + + // Dwarf numbering is different for 32-bit and 64-bit, and there are + // variations by target as well. Currently the first entry is for X86-64, + // second - for EH on X86-32/Darwin and third is 'generic' one (X86-32/Linux + // and debug information on X86-32/Darwin) + + // 8-bit registers + // Low registers + def AL : Register<"al">; + def DL : Register<"dl">; + def CL : Register<"cl">; + def BL : Register<"bl">; + + // X86-64 only, requires REX. + let CostPerUse = 1 in { + def SIL : Register<"sil">; + def DIL : Register<"dil">; + def BPL : Register<"bpl">; + def SPL : Register<"spl">; + def R8B : Register<"r8b">; + def R9B : Register<"r9b">; + def R10B : Register<"r10b">; + def R11B : Register<"r11b">; + def R12B : Register<"r12b">; + def R13B : Register<"r13b">; + def R14B : Register<"r14b">; + def R15B : Register<"r15b">; + } + + // High registers. On x86-64, these cannot be used in any instruction + // with a REX prefix. + def AH : Register<"ah">; + def DH : Register<"dh">; + def CH : Register<"ch">; + def BH : Register<"bh">; + + // 16-bit registers + let SubRegIndices = [sub_8bit, sub_8bit_hi] in { + def AX : RegisterWithSubRegs<"ax", [AL,AH]>; + def DX : RegisterWithSubRegs<"dx", [DL,DH]>; + def CX : RegisterWithSubRegs<"cx", [CL,CH]>; + def BX : RegisterWithSubRegs<"bx", [BL,BH]>; + } + let SubRegIndices = [sub_8bit] in { + def SI : RegisterWithSubRegs<"si", [SIL]>; + def DI : RegisterWithSubRegs<"di", [DIL]>; + def BP : RegisterWithSubRegs<"bp", [BPL]>; + def SP : RegisterWithSubRegs<"sp", [SPL]>; + } + def IP : Register<"ip">; + + // X86-64 only, requires REX. 
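+  // These are the 16-bit views of R8-R15; CostPerUse accounts for the
+  // extra REX prefix byte they require.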
+ let SubRegIndices = [sub_8bit], CostPerUse = 1 in { + def R8W : RegisterWithSubRegs<"r8w", [R8B]>; + def R9W : RegisterWithSubRegs<"r9w", [R9B]>; + def R10W : RegisterWithSubRegs<"r10w", [R10B]>; + def R11W : RegisterWithSubRegs<"r11w", [R11B]>; + def R12W : RegisterWithSubRegs<"r12w", [R12B]>; + def R13W : RegisterWithSubRegs<"r13w", [R13B]>; + def R14W : RegisterWithSubRegs<"r14w", [R14B]>; + def R15W : RegisterWithSubRegs<"r15w", [R15B]>; + } + // 32-bit registers + let SubRegIndices = [sub_16bit] in { + def EAX : RegisterWithSubRegs<"eax", [AX]>, DwarfRegNum<[-2, 0, 0]>; + def EDX : RegisterWithSubRegs<"edx", [DX]>, DwarfRegNum<[-2, 2, 2]>; + def ECX : RegisterWithSubRegs<"ecx", [CX]>, DwarfRegNum<[-2, 1, 1]>; + def EBX : RegisterWithSubRegs<"ebx", [BX]>, DwarfRegNum<[-2, 3, 3]>; + def ESI : RegisterWithSubRegs<"esi", [SI]>, DwarfRegNum<[-2, 6, 6]>; + def EDI : RegisterWithSubRegs<"edi", [DI]>, DwarfRegNum<[-2, 7, 7]>; + def EBP : RegisterWithSubRegs<"ebp", [BP]>, DwarfRegNum<[-2, 4, 5]>; + def ESP : RegisterWithSubRegs<"esp", [SP]>, DwarfRegNum<[-2, 5, 4]>; + def EIP : RegisterWithSubRegs<"eip", [IP]>, DwarfRegNum<[-2, 8, 8]>; + + // X86-64 only, requires REX + let CostPerUse = 1 in { + def R8D : RegisterWithSubRegs<"r8d", [R8W]>; + def R9D : RegisterWithSubRegs<"r9d", [R9W]>; + def R10D : RegisterWithSubRegs<"r10d", [R10W]>; + def R11D : RegisterWithSubRegs<"r11d", [R11W]>; + def R12D : RegisterWithSubRegs<"r12d", [R12W]>; + def R13D : RegisterWithSubRegs<"r13d", [R13W]>; + def R14D : RegisterWithSubRegs<"r14d", [R14W]>; + def R15D : RegisterWithSubRegs<"r15d", [R15W]>; + }} + + // 64-bit registers, X86-64 only + let SubRegIndices = [sub_32bit] in { + def RAX : RegisterWithSubRegs<"rax", [EAX]>, DwarfRegNum<[0, -2, -2]>; + def RDX : RegisterWithSubRegs<"rdx", [EDX]>, DwarfRegNum<[1, -2, -2]>; + def RCX : RegisterWithSubRegs<"rcx", [ECX]>, DwarfRegNum<[2, -2, -2]>; + def RBX : RegisterWithSubRegs<"rbx", [EBX]>, DwarfRegNum<[3, -2, -2]>; + def RSI : RegisterWithSubRegs<"rsi", [ESI]>, DwarfRegNum<[4, -2, -2]>; + def RDI : RegisterWithSubRegs<"rdi", [EDI]>, DwarfRegNum<[5, -2, -2]>; + def RBP : RegisterWithSubRegs<"rbp", [EBP]>, DwarfRegNum<[6, -2, -2]>; + def RSP : RegisterWithSubRegs<"rsp", [ESP]>, DwarfRegNum<[7, -2, -2]>; + + // These also require REX. + let CostPerUse = 1 in { + def R8 : RegisterWithSubRegs<"r8", [R8D]>, DwarfRegNum<[8, -2, -2]>; + def R9 : RegisterWithSubRegs<"r9", [R9D]>, DwarfRegNum<[9, -2, -2]>; + def R10 : RegisterWithSubRegs<"r10", [R10D]>, DwarfRegNum<[10, -2, -2]>; + def R11 : RegisterWithSubRegs<"r11", [R11D]>, DwarfRegNum<[11, -2, -2]>; + def R12 : RegisterWithSubRegs<"r12", [R12D]>, DwarfRegNum<[12, -2, -2]>; + def R13 : RegisterWithSubRegs<"r13", [R13D]>, DwarfRegNum<[13, -2, -2]>; + def R14 : RegisterWithSubRegs<"r14", [R14D]>, DwarfRegNum<[14, -2, -2]>; + def R15 : RegisterWithSubRegs<"r15", [R15D]>, DwarfRegNum<[15, -2, -2]>; + def RIP : RegisterWithSubRegs<"rip", [EIP]>, DwarfRegNum<[16, -2, -2]>; + }} + + // MMX Registers. These are actually aliased to ST0 .. 
ST7 + def MM0 : Register<"mm0">, DwarfRegNum<[41, 29, 29]>; + def MM1 : Register<"mm1">, DwarfRegNum<[42, 30, 30]>; + def MM2 : Register<"mm2">, DwarfRegNum<[43, 31, 31]>; + def MM3 : Register<"mm3">, DwarfRegNum<[44, 32, 32]>; + def MM4 : Register<"mm4">, DwarfRegNum<[45, 33, 33]>; + def MM5 : Register<"mm5">, DwarfRegNum<[46, 34, 34]>; + def MM6 : Register<"mm6">, DwarfRegNum<[47, 35, 35]>; + def MM7 : Register<"mm7">, DwarfRegNum<[48, 36, 36]>; + + // Pseudo Floating Point registers + def FP0 : Register<"fp0">; + def FP1 : Register<"fp1">; + def FP2 : Register<"fp2">; + def FP3 : Register<"fp3">; + def FP4 : Register<"fp4">; + def FP5 : Register<"fp5">; + def FP6 : Register<"fp6">; + + // XMM Registers, used by the various SSE instruction set extensions. + // The sub_ss and sub_sd subregs are the same registers with another regclass. + let CompositeIndices = [(sub_ss), (sub_sd)] in { + def XMM0: Register<"xmm0">, DwarfRegNum<[17, 21, 21]>; + def XMM1: Register<"xmm1">, DwarfRegNum<[18, 22, 22]>; + def XMM2: Register<"xmm2">, DwarfRegNum<[19, 23, 23]>; + def XMM3: Register<"xmm3">, DwarfRegNum<[20, 24, 24]>; + def XMM4: Register<"xmm4">, DwarfRegNum<[21, 25, 25]>; + def XMM5: Register<"xmm5">, DwarfRegNum<[22, 26, 26]>; + def XMM6: Register<"xmm6">, DwarfRegNum<[23, 27, 27]>; + def XMM7: Register<"xmm7">, DwarfRegNum<[24, 28, 28]>; + + // X86-64 only + let CostPerUse = 1 in { + def XMM8: Register<"xmm8">, DwarfRegNum<[25, -2, -2]>; + def XMM9: Register<"xmm9">, DwarfRegNum<[26, -2, -2]>; + def XMM10: Register<"xmm10">, DwarfRegNum<[27, -2, -2]>; + def XMM11: Register<"xmm11">, DwarfRegNum<[28, -2, -2]>; + def XMM12: Register<"xmm12">, DwarfRegNum<[29, -2, -2]>; + def XMM13: Register<"xmm13">, DwarfRegNum<[30, -2, -2]>; + def XMM14: Register<"xmm14">, DwarfRegNum<[31, -2, -2]>; + def XMM15: Register<"xmm15">, DwarfRegNum<[32, -2, -2]>; + }} + + // YMM Registers, used by AVX instructions + let SubRegIndices = [sub_xmm] in { + def YMM0: RegisterWithSubRegs<"ymm0", [XMM0]>, DwarfRegAlias<XMM0>; + def YMM1: RegisterWithSubRegs<"ymm1", [XMM1]>, DwarfRegAlias<XMM1>; + def YMM2: RegisterWithSubRegs<"ymm2", [XMM2]>, DwarfRegAlias<XMM2>; + def YMM3: RegisterWithSubRegs<"ymm3", [XMM3]>, DwarfRegAlias<XMM3>; + def YMM4: RegisterWithSubRegs<"ymm4", [XMM4]>, DwarfRegAlias<XMM4>; + def YMM5: RegisterWithSubRegs<"ymm5", [XMM5]>, DwarfRegAlias<XMM5>; + def YMM6: RegisterWithSubRegs<"ymm6", [XMM6]>, DwarfRegAlias<XMM6>; + def YMM7: RegisterWithSubRegs<"ymm7", [XMM7]>, DwarfRegAlias<XMM7>; + def YMM8: RegisterWithSubRegs<"ymm8", [XMM8]>, DwarfRegAlias<XMM8>; + def YMM9: RegisterWithSubRegs<"ymm9", [XMM9]>, DwarfRegAlias<XMM9>; + def YMM10: RegisterWithSubRegs<"ymm10", [XMM10]>, DwarfRegAlias<XMM10>; + def YMM11: RegisterWithSubRegs<"ymm11", [XMM11]>, DwarfRegAlias<XMM11>; + def YMM12: RegisterWithSubRegs<"ymm12", [XMM12]>, DwarfRegAlias<XMM12>; + def YMM13: RegisterWithSubRegs<"ymm13", [XMM13]>, DwarfRegAlias<XMM13>; + def YMM14: RegisterWithSubRegs<"ymm14", [XMM14]>, DwarfRegAlias<XMM14>; + def YMM15: RegisterWithSubRegs<"ymm15", [XMM15]>, DwarfRegAlias<XMM15>; + } + + class STRegister<string Name, list<Register> A> : Register<Name> { + let Aliases = A; + } + + // Floating point stack registers. These don't map one-to-one to the FP + // pseudo registers, but we still mark them as aliasing FP registers. That + // way both kinds can be live without exceeding the stack depth. ST registers + // are only live around inline assembly. 
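+  // Note that there are only seven FPn pseudo registers, so ST0 is left
+  // without an FP alias.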
+ def ST0 : STRegister<"st(0)", []>, DwarfRegNum<[33, 12, 11]>; + def ST1 : STRegister<"st(1)", [FP6]>, DwarfRegNum<[34, 13, 12]>; + def ST2 : STRegister<"st(2)", [FP5]>, DwarfRegNum<[35, 14, 13]>; + def ST3 : STRegister<"st(3)", [FP4]>, DwarfRegNum<[36, 15, 14]>; + def ST4 : STRegister<"st(4)", [FP3]>, DwarfRegNum<[37, 16, 15]>; + def ST5 : STRegister<"st(5)", [FP2]>, DwarfRegNum<[38, 17, 16]>; + def ST6 : STRegister<"st(6)", [FP1]>, DwarfRegNum<[39, 18, 17]>; + def ST7 : STRegister<"st(7)", [FP0]>, DwarfRegNum<[40, 19, 18]>; + + // Status flags register + def EFLAGS : Register<"flags">; + + // Segment registers + def CS : Register<"cs">; + def DS : Register<"ds">; + def SS : Register<"ss">; + def ES : Register<"es">; + def FS : Register<"fs">; + def GS : Register<"gs">; + + // Debug registers + def DR0 : Register<"dr0">; + def DR1 : Register<"dr1">; + def DR2 : Register<"dr2">; + def DR3 : Register<"dr3">; + def DR4 : Register<"dr4">; + def DR5 : Register<"dr5">; + def DR6 : Register<"dr6">; + def DR7 : Register<"dr7">; + + // Control registers + def CR0 : Register<"cr0">; + def CR1 : Register<"cr1">; + def CR2 : Register<"cr2">; + def CR3 : Register<"cr3">; + def CR4 : Register<"cr4">; + def CR5 : Register<"cr5">; + def CR6 : Register<"cr6">; + def CR7 : Register<"cr7">; + def CR8 : Register<"cr8">; + def CR9 : Register<"cr9">; + def CR10 : Register<"cr10">; + def CR11 : Register<"cr11">; + def CR12 : Register<"cr12">; + def CR13 : Register<"cr13">; + def CR14 : Register<"cr14">; + def CR15 : Register<"cr15">; + + // Pseudo index registers + def EIZ : Register<"eiz">; + def RIZ : Register<"riz">; +} + + +//===----------------------------------------------------------------------===// +// Register Class Definitions... now that we have all of the pieces, define the +// top-level register classes. The order specified in the register list is +// implicitly defined to be the register allocation order. +// + +// List call-clobbered registers before callee-save registers. RBX, RBP, (and +// R12, R13, R14, and R15 for X86-64) are callee-save registers. +// In 64-mode, there are 12 additional i8 registers, SIL, DIL, BPL, SPL, and +// R8B, ... R15B. +// Allocate R12 and R13 last, as these require an extra byte when +// encoded in x86_64 instructions. +// FIXME: Allow AH, CH, DH, BH to be used as general-purpose registers in +// 64-bit mode. The main complication is that they cannot be encoded in an +// instruction requiring a REX prefix, while SIL, DIL, BPL, R8D, etc. +// require a REX prefix. For example, "addb %ah, %dil" and "movzbl %ah, %r8d" +// cannot be encoded. +def GR8 : RegisterClass<"X86", [i8], 8, + (add AL, CL, DL, AH, CH, DH, BL, BH, SIL, DIL, BPL, SPL, + R8B, R9B, R10B, R11B, R14B, R15B, R12B, R13B)> { + let AltOrders = [(sub GR8, AH, BH, CH, DH)]; + let AltOrderSelect = [{ + return MF.getTarget().getSubtarget<X86Subtarget>().is64Bit(); + }]; +} + +def GR16 : RegisterClass<"X86", [i16], 16, + (add AX, CX, DX, SI, DI, BX, BP, SP, + R8W, R9W, R10W, R11W, R14W, R15W, R12W, R13W)> { + let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi)]; +} + +def GR32 : RegisterClass<"X86", [i32], 32, + (add EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP, + R8D, R9D, R10D, R11D, R14D, R15D, R12D, R13D)> { + let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit)]; +} + +// GR64 - 64-bit GPRs. This oddly includes RIP, which isn't accurate, since +// RIP isn't really a register and it can't be used anywhere except in an +// address, but it doesn't cause trouble. 
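+// RIP is also marked reserved by the register info, so the allocator never
+// actually hands it out.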
+def GR64 : RegisterClass<"X86", [i64], 64, + (add RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, + RBX, R14, R15, R12, R13, RBP, RSP, RIP)> { + let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), + (GR16 sub_16bit), + (GR32 sub_32bit)]; +} + +// Segment registers for use by MOV instructions (and others) that have a +// segment register as one operand. Always contain a 16-bit segment +// descriptor. +def SEGMENT_REG : RegisterClass<"X86", [i16], 16, (add CS, DS, SS, ES, FS, GS)>; + +// Debug registers. +def DEBUG_REG : RegisterClass<"X86", [i32], 32, (sequence "DR%u", 0, 7)>; + +// Control registers. +def CONTROL_REG : RegisterClass<"X86", [i64], 64, (sequence "CR%u", 0, 15)>; + +// GR8_ABCD_L, GR8_ABCD_H, GR16_ABCD, GR32_ABCD, GR64_ABCD - Subclasses of +// GR8, GR16, GR32, and GR64 which contain just the "a" "b", "c", and "d" +// registers. On x86-32, GR16_ABCD and GR32_ABCD are classes for registers +// that support 8-bit subreg operations. On x86-64, GR16_ABCD, GR32_ABCD, +// and GR64_ABCD are classes for registers that support 8-bit h-register +// operations. +def GR8_ABCD_L : RegisterClass<"X86", [i8], 8, (add AL, CL, DL, BL)>; +def GR8_ABCD_H : RegisterClass<"X86", [i8], 8, (add AH, CH, DH, BH)>; +def GR16_ABCD : RegisterClass<"X86", [i16], 16, (add AX, CX, DX, BX)> { + let SubRegClasses = [(GR8_ABCD_L sub_8bit), (GR8_ABCD_H sub_8bit_hi)]; +} +def GR32_ABCD : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX, EBX)> { + let SubRegClasses = [(GR8_ABCD_L sub_8bit), + (GR8_ABCD_H sub_8bit_hi), + (GR16_ABCD sub_16bit)]; +} +def GR64_ABCD : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RBX)> { + let SubRegClasses = [(GR8_ABCD_L sub_8bit), + (GR8_ABCD_H sub_8bit_hi), + (GR16_ABCD sub_16bit), + (GR32_ABCD sub_32bit)]; +} +def GR32_TC : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX)> { + let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit)]; +} +def GR64_TC : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RSI, RDI, + R8, R9, R11, RIP)> { + let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), + (GR16 sub_16bit), + (GR32_TC sub_32bit)]; +} + +def GR64_TCW64 : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, + R8, R9, R11)>; + +// GR8_NOREX - GR8 registers which do not require a REX prefix. +def GR8_NOREX : RegisterClass<"X86", [i8], 8, + (add AL, CL, DL, AH, CH, DH, BL, BH)> { + let AltOrders = [(sub GR8_NOREX, AH, BH, CH, DH)]; + let AltOrderSelect = [{ + return MF.getTarget().getSubtarget<X86Subtarget>().is64Bit(); + }]; +} +// GR16_NOREX - GR16 registers which do not require a REX prefix. +def GR16_NOREX : RegisterClass<"X86", [i16], 16, + (add AX, CX, DX, SI, DI, BX, BP, SP)> { + let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi)]; +} +// GR32_NOREX - GR32 registers which do not require a REX prefix. +def GR32_NOREX : RegisterClass<"X86", [i32], 32, + (add EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP)> { + let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi), + (GR16_NOREX sub_16bit)]; +} +// GR64_NOREX - GR64 registers which do not require a REX prefix. +def GR64_NOREX : RegisterClass<"X86", [i64], 64, + (add RAX, RCX, RDX, RSI, RDI, RBX, RBP, RSP, RIP)> { + let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi), + (GR16_NOREX sub_16bit), + (GR32_NOREX sub_32bit)]; +} + +// GR32_NOAX - GR32 registers except EAX. Used by AddRegFrm of XCHG32 in 64-bit +// mode to prevent encoding using the 0x90 NOP encoding. xchg %eax, %eax needs +// to clear upper 32-bits of RAX so is not a NOP. 
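+// (The 0x90 form is the architectural NOP and would not perform the required
+// zero-extension of RAX.)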
+def GR32_NOAX : RegisterClass<"X86", [i32], 32, (sub GR32, EAX)> { + let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit)]; +} + +// GR32_NOSP - GR32 registers except ESP. +def GR32_NOSP : RegisterClass<"X86", [i32], 32, (sub GR32, ESP)> { + let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit)]; +} + +// GR64_NOSP - GR64 registers except RSP (and RIP). +def GR64_NOSP : RegisterClass<"X86", [i64], 64, (sub GR64, RSP, RIP)> { + let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), + (GR16 sub_16bit), + (GR32_NOSP sub_32bit)]; +} + +// GR32_NOREX_NOSP - GR32 registers which do not require a REX prefix except +// ESP. +def GR32_NOREX_NOSP : RegisterClass<"X86", [i32], 32, + (and GR32_NOREX, GR32_NOSP)> { + let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi), + (GR16_NOREX sub_16bit)]; +} + +// GR64_NOREX_NOSP - GR64_NOREX registers except RSP. +def GR64_NOREX_NOSP : RegisterClass<"X86", [i64], 64, + (and GR64_NOREX, GR64_NOSP)> { + let SubRegClasses = [(GR8_NOREX sub_8bit, sub_8bit_hi), + (GR16_NOREX sub_16bit), + (GR32_NOREX_NOSP sub_32bit)]; +} + +// A class to support the 'A' assembler constraint: EAX then EDX. +def GR32_AD : RegisterClass<"X86", [i32], 32, (add EAX, EDX)> { + let SubRegClasses = [(GR8_ABCD_L sub_8bit), + (GR8_ABCD_H sub_8bit_hi), + (GR16_ABCD sub_16bit)]; +} + +// Scalar SSE2 floating point registers. +def FR32 : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 15)>; + +def FR64 : RegisterClass<"X86", [f64], 64, (add FR32)>; + + +// FIXME: This sets up the floating point register files as though they are f64 +// values, though they really are f80 values. This will cause us to spill +// values as 64-bit quantities instead of 80-bit quantities, which is much much +// faster on common hardware. In reality, this should be controlled by a +// command line option or something. + +def RFP32 : RegisterClass<"X86",[f32], 32, (sequence "FP%u", 0, 6)>; +def RFP64 : RegisterClass<"X86",[f64], 32, (add RFP32)>; +def RFP80 : RegisterClass<"X86",[f80], 32, (add RFP32)>; + +// Floating point stack registers (these are not allocatable by the +// register allocator - the floating point stackifier is responsible +// for transforming FPn allocations to STn registers) +def RST : RegisterClass<"X86", [f80, f64, f32], 32, (sequence "ST%u", 0, 7)> { + let isAllocatable = 0; +} + +// Generic vector registers: VR64 and VR128. +def VR64: RegisterClass<"X86", [x86mmx], 64, (sequence "MM%u", 0, 7)>; +def VR128 : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + 128, (add FR32)> { + let SubRegClasses = [(FR32 sub_ss), (FR64 sub_sd)]; +} + +def VR256 : RegisterClass<"X86", [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + 256, (sequence "YMM%u", 0, 15)> { + let SubRegClasses = [(FR32 sub_ss), (FR64 sub_sd), (VR128 sub_xmm)]; +} + +// Status flags registers. +def CCR : RegisterClass<"X86", [i32], 32, (add EFLAGS)> { + let CopyCost = -1; // Don't allow copying of status registers. + let isAllocatable = 0; +} diff --git a/contrib/llvm/lib/Target/X86/X86Relocations.h b/contrib/llvm/lib/Target/X86/X86Relocations.h new file mode 100644 index 0000000..990962d --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86Relocations.h @@ -0,0 +1,52 @@ +//===- X86Relocations.h - X86 Code Relocations ------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file defines the X86 target-specific relocation types. +// +//===----------------------------------------------------------------------===// + +#ifndef X86RELOCATIONS_H +#define X86RELOCATIONS_H + +#include "llvm/CodeGen/MachineRelocation.h" + +namespace llvm { + namespace X86 { + /// RelocationType - An enum for the x86 relocation codes. Note that + /// the terminology here doesn't follow x86 convention - word means + /// 32-bit and dword means 64-bit. The relocations will be treated + /// by JIT or ObjectCode emitters, this is transparent to the x86 code + /// emitter but JIT and ObjectCode will treat them differently + enum RelocationType { + /// reloc_pcrel_word - PC relative relocation, add the relocated value to + /// the value already in memory, after we adjust it for where the PC is. + reloc_pcrel_word = 0, + + /// reloc_picrel_word - PIC base relative relocation, add the relocated + /// value to the value already in memory, after we adjust it for where the + /// PIC base is. + reloc_picrel_word = 1, + + /// reloc_absolute_word - absolute relocation, just add the relocated + /// value to the value already in memory. + reloc_absolute_word = 2, + + /// reloc_absolute_word_sext - absolute relocation, just add the relocated + /// value to the value already in memory. In object files, it represents a + /// value which must be sign-extended when resolving the relocation. + reloc_absolute_word_sext = 3, + + /// reloc_absolute_dword - absolute relocation, just add the relocated + /// value to the value already in memory. + reloc_absolute_dword = 4 + }; + } +} + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp new file mode 100644 index 0000000..6406bce --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -0,0 +1,259 @@ +//===-- X86SelectionDAGInfo.cpp - X86 SelectionDAG Info -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the X86SelectionDAGInfo class. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "x86-selectiondag-info" +#include "X86TargetMachine.h" +#include "llvm/DerivedTypes.h" +#include "llvm/CodeGen/SelectionDAG.h" +using namespace llvm; + +X86SelectionDAGInfo::X86SelectionDAGInfo(const X86TargetMachine &TM) : + TargetSelectionDAGInfo(TM), + Subtarget(&TM.getSubtarget<X86Subtarget>()), + TLI(*TM.getTargetLowering()) { +} + +X86SelectionDAGInfo::~X86SelectionDAGInfo() { +} + +SDValue +X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl, + SDValue Chain, + SDValue Dst, SDValue Src, + SDValue Size, unsigned Align, + bool isVolatile, + MachinePointerInfo DstPtrInfo) const { + ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); + + // If to a segment-relative address space, use the default lowering. + if (DstPtrInfo.getAddrSpace() >= 256) + return SDValue(); + + // If not DWORD aligned or size is more than the threshold, call the library. + // The libc version is likely to be faster for these cases. It can use the + // address value and run time information about the CPU. 
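+  // (When the value being stored is zero, this path may instead emit a call
+  // to the platform's __bzero entry point if the subtarget provides one.)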
+ if ((Align & 3) != 0 || + !ConstantSize || + ConstantSize->getZExtValue() > + Subtarget->getMaxInlineSizeThreshold()) { + SDValue InFlag(0, 0); + + // Check to see if there is a specialized entry-point for memory zeroing. + ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src); + + if (const char *bzeroEntry = V && + V->isNullValue() ? Subtarget->getBZeroEntry() : 0) { + EVT IntPtr = TLI.getPointerTy(); + Type *IntPtrTy = getTargetData()->getIntPtrType(*DAG.getContext()); + TargetLowering::ArgListTy Args; + TargetLowering::ArgListEntry Entry; + Entry.Node = Dst; + Entry.Ty = IntPtrTy; + Args.push_back(Entry); + Entry.Node = Size; + Args.push_back(Entry); + std::pair<SDValue,SDValue> CallResult = + TLI.LowerCallTo(Chain, Type::getVoidTy(*DAG.getContext()), + false, false, false, false, + 0, CallingConv::C, false, /*isReturnValueUsed=*/false, + DAG.getExternalSymbol(bzeroEntry, IntPtr), Args, + DAG, dl); + return CallResult.second; + } + + // Otherwise have the target-independent code call memset. + return SDValue(); + } + + uint64_t SizeVal = ConstantSize->getZExtValue(); + SDValue InFlag(0, 0); + EVT AVT; + SDValue Count; + ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Src); + unsigned BytesLeft = 0; + bool TwoRepStos = false; + if (ValC) { + unsigned ValReg; + uint64_t Val = ValC->getZExtValue() & 255; + + // If the value is a constant, then we can potentially use larger sets. + switch (Align & 3) { + case 2: // WORD aligned + AVT = MVT::i16; + ValReg = X86::AX; + Val = (Val << 8) | Val; + break; + case 0: // DWORD aligned + AVT = MVT::i32; + ValReg = X86::EAX; + Val = (Val << 8) | Val; + Val = (Val << 16) | Val; + if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) { // QWORD aligned + AVT = MVT::i64; + ValReg = X86::RAX; + Val = (Val << 32) | Val; + } + break; + default: // Byte aligned + AVT = MVT::i8; + ValReg = X86::AL; + Count = DAG.getIntPtrConstant(SizeVal); + break; + } + + if (AVT.bitsGT(MVT::i8)) { + unsigned UBytes = AVT.getSizeInBits() / 8; + Count = DAG.getIntPtrConstant(SizeVal / UBytes); + BytesLeft = SizeVal % UBytes; + } + + Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, AVT), + InFlag); + InFlag = Chain.getValue(1); + } else { + AVT = MVT::i8; + Count = DAG.getIntPtrConstant(SizeVal); + Chain = DAG.getCopyToReg(Chain, dl, X86::AL, Src, InFlag); + InFlag = Chain.getValue(1); + } + + Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX : + X86::ECX, + Count, InFlag); + InFlag = Chain.getValue(1); + Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI : + X86::EDI, + Dst, InFlag); + InFlag = Chain.getValue(1); + + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag }; + Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops, array_lengthof(Ops)); + + if (TwoRepStos) { + InFlag = Chain.getValue(1); + Count = Size; + EVT CVT = Count.getValueType(); + SDValue Left = DAG.getNode(ISD::AND, dl, CVT, Count, + DAG.getConstant((AVT == MVT::i64) ? 7 : 3, CVT)); + Chain = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? X86::RCX : + X86::ECX, + Left, InFlag); + InFlag = Chain.getValue(1); + Tys = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue Ops[] = { Chain, DAG.getValueType(MVT::i8), InFlag }; + Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops, array_lengthof(Ops)); + } else if (BytesLeft) { + // Handle the last 1 - 7 bytes. 
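+    // e.g. a DWORD-aligned 13-byte memset runs 3 rep;stos iterations above
+    // and stores the final byte through the narrower memset below.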
+ unsigned Offset = SizeVal - BytesLeft; + EVT AddrVT = Dst.getValueType(); + EVT SizeVT = Size.getValueType(); + + Chain = DAG.getMemset(Chain, dl, + DAG.getNode(ISD::ADD, dl, AddrVT, Dst, + DAG.getConstant(Offset, AddrVT)), + Src, + DAG.getConstant(BytesLeft, SizeVT), + Align, isVolatile, DstPtrInfo.getWithOffset(Offset)); + } + + // TODO: Use a Tokenfactor, as in memcpy, instead of a single chain. + return Chain; +} + +SDValue +X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, + SDValue Chain, SDValue Dst, SDValue Src, + SDValue Size, unsigned Align, + bool isVolatile, bool AlwaysInline, + MachinePointerInfo DstPtrInfo, + MachinePointerInfo SrcPtrInfo) const { + // This requires the copy size to be a constant, preferably + // within a subtarget-specific limit. + ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); + if (!ConstantSize) + return SDValue(); + uint64_t SizeVal = ConstantSize->getZExtValue(); + if (!AlwaysInline && SizeVal > Subtarget->getMaxInlineSizeThreshold()) + return SDValue(); + + /// If not DWORD aligned, it is more efficient to call the library. However + /// if calling the library is not allowed (AlwaysInline), then soldier on as + /// the code generated here is better than the long load-store sequence we + /// would otherwise get. + if (!AlwaysInline && (Align & 3) != 0) + return SDValue(); + + // If to a segment-relative address space, use the default lowering. + if (DstPtrInfo.getAddrSpace() >= 256 || + SrcPtrInfo.getAddrSpace() >= 256) + return SDValue(); + + MVT AVT; + if (Align & 1) + AVT = MVT::i8; + else if (Align & 2) + AVT = MVT::i16; + else if (Align & 4) + // DWORD aligned + AVT = MVT::i32; + else + // QWORD aligned + AVT = Subtarget->is64Bit() ? MVT::i64 : MVT::i32; + + unsigned UBytes = AVT.getSizeInBits() / 8; + unsigned CountVal = SizeVal / UBytes; + SDValue Count = DAG.getIntPtrConstant(CountVal); + unsigned BytesLeft = SizeVal % UBytes; + + SDValue InFlag(0, 0); + Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX : + X86::ECX, + Count, InFlag); + InFlag = Chain.getValue(1); + Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI : + X86::EDI, + Dst, InFlag); + InFlag = Chain.getValue(1); + Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RSI : + X86::ESI, + Src, InFlag); + InFlag = Chain.getValue(1); + + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag }; + SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops, + array_lengthof(Ops)); + + SmallVector<SDValue, 4> Results; + Results.push_back(RepMovs); + if (BytesLeft) { + // Handle the last 1 - 7 bytes. 
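+    // e.g. a QWORD-aligned 20-byte copy on x86-64 runs 2 rep;movs iterations
+    // above and copies the remaining 4 bytes with the narrower memcpy below.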
+ unsigned Offset = SizeVal - BytesLeft; + EVT DstVT = Dst.getValueType(); + EVT SrcVT = Src.getValueType(); + EVT SizeVT = Size.getValueType(); + Results.push_back(DAG.getMemcpy(Chain, dl, + DAG.getNode(ISD::ADD, dl, DstVT, Dst, + DAG.getConstant(Offset, DstVT)), + DAG.getNode(ISD::ADD, dl, SrcVT, Src, + DAG.getConstant(Offset, SrcVT)), + DAG.getConstant(BytesLeft, SizeVT), + Align, isVolatile, AlwaysInline, + DstPtrInfo.getWithOffset(Offset), + SrcPtrInfo.getWithOffset(Offset))); + } + + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + &Results[0], Results.size()); +} diff --git a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h new file mode 100644 index 0000000..d1d66fe --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h @@ -0,0 +1,56 @@ +//===-- X86SelectionDAGInfo.h - X86 SelectionDAG Info -----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the X86 subclass for TargetSelectionDAGInfo. +// +//===----------------------------------------------------------------------===// + +#ifndef X86SELECTIONDAGINFO_H +#define X86SELECTIONDAGINFO_H + +#include "llvm/Target/TargetSelectionDAGInfo.h" + +namespace llvm { + +class X86TargetLowering; +class X86TargetMachine; +class X86Subtarget; + +class X86SelectionDAGInfo : public TargetSelectionDAGInfo { + /// Subtarget - Keep a pointer to the X86Subtarget around so that we can + /// make the right decision when generating code for different targets. + const X86Subtarget *Subtarget; + + const X86TargetLowering &TLI; + +public: + explicit X86SelectionDAGInfo(const X86TargetMachine &TM); + ~X86SelectionDAGInfo(); + + virtual + SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl, + SDValue Chain, + SDValue Dst, SDValue Src, + SDValue Size, unsigned Align, + bool isVolatile, + MachinePointerInfo DstPtrInfo) const; + + virtual + SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, + SDValue Chain, + SDValue Dst, SDValue Src, + SDValue Size, unsigned Align, + bool isVolatile, bool AlwaysInline, + MachinePointerInfo DstPtrInfo, + MachinePointerInfo SrcPtrInfo) const; +}; + +} + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.cpp b/contrib/llvm/lib/Target/X86/X86Subtarget.cpp new file mode 100644 index 0000000..7064dd0 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86Subtarget.cpp @@ -0,0 +1,373 @@ +//===-- X86Subtarget.cpp - X86 Subtarget Information ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the X86 specific subclass of TargetSubtargetInfo. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "subtarget" +#include "X86Subtarget.h" +#include "X86InstrInfo.h" +#include "llvm/GlobalValue.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/Host.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/ADT/SmallVector.h" + +#define GET_SUBTARGETINFO_TARGET_DESC +#define GET_SUBTARGETINFO_CTOR +#include "X86GenSubtargetInfo.inc" + +using namespace llvm; + +#if defined(_MSC_VER) +#include <intrin.h> +#endif + +/// ClassifyBlockAddressReference - Classify a blockaddress reference for the +/// current subtarget according to how we should reference it in a non-pcrel +/// context. +unsigned char X86Subtarget:: +ClassifyBlockAddressReference() const { + if (isPICStyleGOT()) // 32-bit ELF targets. + return X86II::MO_GOTOFF; + + if (isPICStyleStubPIC()) // Darwin/32 in PIC mode. + return X86II::MO_PIC_BASE_OFFSET; + + // Direct static reference to label. + return X86II::MO_NO_FLAG; +} + +/// ClassifyGlobalReference - Classify a global variable reference for the +/// current subtarget according to how we should reference it in a non-pcrel +/// context. +unsigned char X86Subtarget:: +ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const { + // DLLImport only exists on windows, it is implemented as a load from a + // DLLIMPORT stub. + if (GV->hasDLLImportLinkage()) + return X86II::MO_DLLIMPORT; + + // Determine whether this is a reference to a definition or a declaration. + // Materializable GVs (in JIT lazy compilation mode) do not require an extra + // load from stub. + bool isDecl = GV->hasAvailableExternallyLinkage(); + if (GV->isDeclaration() && !GV->isMaterializable()) + isDecl = true; + + // X86-64 in PIC mode. + if (isPICStyleRIPRel()) { + // Large model never uses stubs. + if (TM.getCodeModel() == CodeModel::Large) + return X86II::MO_NO_FLAG; + + if (isTargetDarwin()) { + // If symbol visibility is hidden, the extra load is not needed if + // target is x86-64 or the symbol is definitely defined in the current + // translation unit. + if (GV->hasDefaultVisibility() && + (isDecl || GV->isWeakForLinker())) + return X86II::MO_GOTPCREL; + } else if (!isTargetWin64()) { + assert(isTargetELF() && "Unknown rip-relative target"); + + // Extra load is needed for all externally visible. + if (!GV->hasLocalLinkage() && GV->hasDefaultVisibility()) + return X86II::MO_GOTPCREL; + } + + return X86II::MO_NO_FLAG; + } + + if (isPICStyleGOT()) { // 32-bit ELF targets. + // Extra load is needed for all externally visible. + if (GV->hasLocalLinkage() || GV->hasHiddenVisibility()) + return X86II::MO_GOTOFF; + return X86II::MO_GOT; + } + + if (isPICStyleStubPIC()) { // Darwin/32 in PIC mode. + // Determine whether we have a stub reference and/or whether the reference + // is relative to the PIC base or not. + + // If this is a strong reference to a definition, it is definitely not + // through a stub. + if (!isDecl && !GV->isWeakForLinker()) + return X86II::MO_PIC_BASE_OFFSET; + + // Unless we have a symbol with hidden visibility, we have to go through a + // normal $non_lazy_ptr stub because this symbol might be resolved late. + if (!GV->hasHiddenVisibility()) // Non-hidden $non_lazy_ptr reference. 
+ return X86II::MO_DARWIN_NONLAZY_PIC_BASE; + + // If symbol visibility is hidden, we have a stub for common symbol + // references and external declarations. + if (isDecl || GV->hasCommonLinkage()) { + // Hidden $non_lazy_ptr reference. + return X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE; + } + + // Otherwise, no stub. + return X86II::MO_PIC_BASE_OFFSET; + } + + if (isPICStyleStubNoDynamic()) { // Darwin/32 in -mdynamic-no-pic mode. + // Determine whether we have a stub reference. + + // If this is a strong reference to a definition, it is definitely not + // through a stub. + if (!isDecl && !GV->isWeakForLinker()) + return X86II::MO_NO_FLAG; + + // Unless we have a symbol with hidden visibility, we have to go through a + // normal $non_lazy_ptr stub because this symbol might be resolved late. + if (!GV->hasHiddenVisibility()) // Non-hidden $non_lazy_ptr reference. + return X86II::MO_DARWIN_NONLAZY; + + // Otherwise, no stub. + return X86II::MO_NO_FLAG; + } + + // Direct static reference to global. + return X86II::MO_NO_FLAG; +} + + +/// getBZeroEntry - This function returns the name of a function which has an +/// interface like the non-standard bzero function, if such a function exists on +/// the current subtarget and it is considered prefereable over memset with zero +/// passed as the second argument. Otherwise it returns null. +const char *X86Subtarget::getBZeroEntry() const { + // Darwin 10 has a __bzero entry point for this purpose. + if (getTargetTriple().isMacOSX() && + !getTargetTriple().isMacOSXVersionLT(10, 6)) + return "__bzero"; + + return 0; +} + +/// IsLegalToCallImmediateAddr - Return true if the subtarget allows calls +/// to immediate address. +bool X86Subtarget::IsLegalToCallImmediateAddr(const TargetMachine &TM) const { + if (In64BitMode) + return false; + return isTargetELF() || TM.getRelocationModel() == Reloc::Static; +} + +/// getSpecialAddressLatency - For targets where it is beneficial to +/// backschedule instructions that compute addresses, return a value +/// indicating the number of scheduling cycles of backscheduling that +/// should be attempted. +unsigned X86Subtarget::getSpecialAddressLatency() const { + // For x86 out-of-order targets, back-schedule address computations so + // that loads and stores aren't blocked. + // This value was chosen arbitrarily. + return 200; +} + +void X86Subtarget::AutoDetectSubtargetFeatures() { + unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0; + union { + unsigned u[3]; + char c[12]; + } text; + + if (X86_MC::GetCpuIDAndInfo(0, &EAX, text.u+0, text.u+2, text.u+1)) + return; + + X86_MC::GetCpuIDAndInfo(0x1, &EAX, &EBX, &ECX, &EDX); + + if ((EDX >> 15) & 1) { HasCMov = true; ToggleFeature(X86::FeatureCMOV); } + if ((EDX >> 23) & 1) { X86SSELevel = MMX; ToggleFeature(X86::FeatureMMX); } + if ((EDX >> 25) & 1) { X86SSELevel = SSE1; ToggleFeature(X86::FeatureSSE1); } + if ((EDX >> 26) & 1) { X86SSELevel = SSE2; ToggleFeature(X86::FeatureSSE2); } + if (ECX & 0x1) { X86SSELevel = SSE3; ToggleFeature(X86::FeatureSSE3); } + if ((ECX >> 9) & 1) { X86SSELevel = SSSE3; ToggleFeature(X86::FeatureSSSE3);} + if ((ECX >> 19) & 1) { X86SSELevel = SSE41; ToggleFeature(X86::FeatureSSE41);} + if ((ECX >> 20) & 1) { X86SSELevel = SSE42; ToggleFeature(X86::FeatureSSE42);} + // FIXME: AVX codegen support is not ready. 
+ //if ((ECX >> 28) & 1) { HasAVX = true; ToggleFeature(X86::FeatureAVX); } + + bool IsIntel = memcmp(text.c, "GenuineIntel", 12) == 0; + bool IsAMD = !IsIntel && memcmp(text.c, "AuthenticAMD", 12) == 0; + + if (IsIntel && ((ECX >> 1) & 0x1)) { + HasCLMUL = true; + ToggleFeature(X86::FeatureCLMUL); + } + if (IsIntel && ((ECX >> 12) & 0x1)) { + HasFMA3 = true; + ToggleFeature(X86::FeatureFMA3); + } + if (IsIntel && ((ECX >> 22) & 0x1)) { + HasMOVBE = true; + ToggleFeature(X86::FeatureMOVBE); + } + if (IsIntel && ((ECX >> 23) & 0x1)) { + HasPOPCNT = true; + ToggleFeature(X86::FeaturePOPCNT); + } + if (IsIntel && ((ECX >> 25) & 0x1)) { + HasAES = true; + ToggleFeature(X86::FeatureAES); + } + if (IsIntel && ((ECX >> 29) & 0x1)) { + HasF16C = true; + ToggleFeature(X86::FeatureF16C); + } + if (IsIntel && ((ECX >> 30) & 0x1)) { + HasRDRAND = true; + ToggleFeature(X86::FeatureRDRAND); + } + + if ((ECX >> 13) & 0x1) { + HasCmpxchg16b = true; + ToggleFeature(X86::FeatureCMPXCHG16B); + } + + if (IsIntel || IsAMD) { + // Determine if bit test memory instructions are slow. + unsigned Family = 0; + unsigned Model = 0; + X86_MC::DetectFamilyModel(EAX, Family, Model); + if (IsAMD || (Family == 6 && Model >= 13)) { + IsBTMemSlow = true; + ToggleFeature(X86::FeatureSlowBTMem); + } + // If it's Nehalem, unaligned memory access is fast. + if (Family == 15 && Model == 26) { + IsUAMemFast = true; + ToggleFeature(X86::FeatureFastUAMem); + } + + X86_MC::GetCpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX); + if ((EDX >> 29) & 0x1) { + HasX86_64 = true; + ToggleFeature(X86::Feature64Bit); + } + if ((ECX >> 5) & 0x1) { + HasLZCNT = true; + ToggleFeature(X86::FeatureLZCNT); + } + if (IsAMD && ((ECX >> 6) & 0x1)) { + HasSSE4A = true; + ToggleFeature(X86::FeatureSSE4A); + } + if (IsAMD && ((ECX >> 16) & 0x1)) { + HasFMA4 = true; + ToggleFeature(X86::FeatureFMA4); + } + } +} + +X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU, + const std::string &FS, + unsigned StackAlignOverride, bool is64Bit) + : X86GenSubtargetInfo(TT, CPU, FS) + , PICStyle(PICStyles::None) + , X86SSELevel(NoMMXSSE) + , X863DNowLevel(NoThreeDNow) + , HasCMov(false) + , HasX86_64(false) + , HasPOPCNT(false) + , HasSSE4A(false) + , HasAVX(false) + , HasAES(false) + , HasCLMUL(false) + , HasFMA3(false) + , HasFMA4(false) + , HasMOVBE(false) + , HasRDRAND(false) + , HasF16C(false) + , HasLZCNT(false) + , HasBMI(false) + , IsBTMemSlow(false) + , IsUAMemFast(false) + , HasVectorUAMem(false) + , HasCmpxchg16b(false) + , stackAlignment(8) + // FIXME: this is a known good value for Yonah. How about others? + , MaxInlineSizeThreshold(128) + , TargetTriple(TT) + , In64BitMode(is64Bit) + , InNaClMode(false) { + // Determine default and user specified characteristics + if (!FS.empty() || !CPU.empty()) { + std::string CPUName = CPU; + if (CPUName.empty()) { +#if defined (__x86_64__) || defined(__i386__) + CPUName = sys::getHostCPUName(); +#else + CPUName = "generic"; +#endif + } + + // Make sure 64-bit features are available in 64-bit mode. (But make sure + // SSE2 can be turned off explicitly.) + std::string FullFS = FS; + if (In64BitMode) { + if (!FullFS.empty()) + FullFS = "+64bit,+sse2," + FullFS; + else + FullFS = "+64bit,+sse2"; + } + + // If feature string is not empty, parse features string. + ParseSubtargetFeatures(CPUName, FullFS); + } else { + // Otherwise, use CPUID to auto-detect feature set. + AutoDetectSubtargetFeatures(); + + // Make sure 64-bit features are available in 64-bit mode. 
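+    // The x86-64 baseline guarantees CMOV and SSE2, so force them on even if
+    // CPUID-based detection did not report them.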
+ if (In64BitMode) { + HasX86_64 = true; ToggleFeature(X86::Feature64Bit); + HasCMov = true; ToggleFeature(X86::FeatureCMOV); + + if (!HasAVX && X86SSELevel < SSE2) { + X86SSELevel = SSE2; + ToggleFeature(X86::FeatureSSE1); + ToggleFeature(X86::FeatureSSE2); + } + } + } + + // It's important to keep the MCSubtargetInfo feature bits in sync with + // target data structure which is shared with MC code emitter, etc. + if (In64BitMode) + ToggleFeature(X86::Mode64Bit); + + if (isTargetNaCl()) { + InNaClMode = true; + ToggleFeature(X86::ModeNaCl); + } + + if (HasAVX) + X86SSELevel = NoMMXSSE; + + DEBUG(dbgs() << "Subtarget features: SSELevel " << X86SSELevel + << ", 3DNowLevel " << X863DNowLevel + << ", 64bit " << HasX86_64 << "\n"); + assert((!In64BitMode || HasX86_64) && + "64-bit code requested on a subtarget that doesn't support it!"); + + if(EnableSegmentedStacks && !isTargetELF()) + report_fatal_error("Segmented stacks are only implemented on ELF."); + + // Stack alignment is 16 bytes on Darwin, FreeBSD, Linux and Solaris (both + // 32 and 64 bit) and for all 64-bit targets. + if (StackAlignOverride) + stackAlignment = StackAlignOverride; + else if (isTargetDarwin() || isTargetFreeBSD() || isTargetLinux() || + isTargetSolaris() || In64BitMode) + stackAlignment = 16; +} diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.h b/contrib/llvm/lib/Target/X86/X86Subtarget.h new file mode 100644 index 0000000..3258d3d --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86Subtarget.h @@ -0,0 +1,293 @@ +//=====---- X86Subtarget.h - Define Subtarget for the X86 -----*- C++ -*--====// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the X86 specific subclass of TargetSubtargetInfo. +// +//===----------------------------------------------------------------------===// + +#ifndef X86SUBTARGET_H +#define X86SUBTARGET_H + +#include "llvm/ADT/Triple.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include "llvm/CallingConv.h" +#include <string> + +#define GET_SUBTARGETINFO_HEADER +#include "X86GenSubtargetInfo.inc" + +namespace llvm { +class GlobalValue; +class StringRef; +class TargetMachine; + +/// PICStyles - The X86 backend supports a number of different styles of PIC. +/// +namespace PICStyles { +enum Style { + StubPIC, // Used on i386-darwin in -fPIC mode. + StubDynamicNoPIC, // Used on i386-darwin in -mdynamic-no-pic mode. + GOT, // Used on many 32-bit unices in -fPIC mode. + RIPRel, // Used on X86-64 when not in -static mode. + None // Set when in -static mode (not PIC or DynamicNoPIC mode). +}; +} + +class X86Subtarget : public X86GenSubtargetInfo { +protected: + enum X86SSEEnum { + NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42 + }; + + enum X863DNowEnum { + NoThreeDNow, ThreeDNow, ThreeDNowA + }; + + /// PICStyle - Which PIC style to use + /// + PICStyles::Style PICStyle; + + /// X86SSELevel - MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, or + /// none supported. + X86SSEEnum X86SSELevel; + + /// X863DNowLevel - 3DNow or 3DNow Athlon, or none supported. + /// + X863DNowEnum X863DNowLevel; + + /// HasCMov - True if this processor has conditional move instructions + /// (generally pentium pro+). + bool HasCMov; + + /// HasX86_64 - True if the processor supports X86-64 instructions. + /// + bool HasX86_64; + + /// HasPOPCNT - True if the processor supports POPCNT. 
+ bool HasPOPCNT; + + /// HasSSE4A - True if the processor supports SSE4A instructions. + bool HasSSE4A; + + /// HasAVX - Target has AVX instructions + bool HasAVX; + + /// HasAES - Target has AES instructions + bool HasAES; + + /// HasCLMUL - Target has carry-less multiplication + bool HasCLMUL; + + /// HasFMA3 - Target has 3-operand fused multiply-add + bool HasFMA3; + + /// HasFMA4 - Target has 4-operand fused multiply-add + bool HasFMA4; + + /// HasMOVBE - True if the processor has the MOVBE instruction. + bool HasMOVBE; + + /// HasRDRAND - True if the processor has the RDRAND instruction. + bool HasRDRAND; + + /// HasF16C - Processor has 16-bit floating point conversion instructions. + bool HasF16C; + + /// HasLZCNT - Processor has LZCNT instruction. + bool HasLZCNT; + + /// HasBMI - Processor has BMI1 instructions. + bool HasBMI; + + /// IsBTMemSlow - True if BT (bit test) of memory instructions are slow. + bool IsBTMemSlow; + + /// IsUAMemFast - True if unaligned memory access is fast. + bool IsUAMemFast; + + /// HasVectorUAMem - True if SIMD operations can have unaligned memory + /// operands. This may require setting a feature bit in the processor. + bool HasVectorUAMem; + + /// HasCmpxchg16b - True if this processor has the CMPXCHG16B instruction; + /// this is true for most x86-64 chips, but not the first AMD chips. + bool HasCmpxchg16b; + + /// stackAlignment - The minimum alignment known to hold of the stack frame on + /// entry to the function and which must be maintained by every function. + unsigned stackAlignment; + + /// Max. memset / memcpy size that is turned into rep/movs, rep/stos ops. + /// + unsigned MaxInlineSizeThreshold; + + /// TargetTriple - What processor and OS we're targeting. + Triple TargetTriple; + +private: + /// In64BitMode - True if compiling for 64-bit, false for 32-bit. + bool In64BitMode; + + /// InNaClMode - True if compiling for Native Client target. + bool InNaClMode; + +public: + + /// This constructor initializes the data members to match that + /// of the specified triple. + /// + X86Subtarget(const std::string &TT, const std::string &CPU, + const std::string &FS, + unsigned StackAlignOverride, bool is64Bit); + + /// getStackAlignment - Returns the minimum alignment known to hold of the + /// stack frame on entry to the function and which must be maintained by every + /// function for this subtarget. + unsigned getStackAlignment() const { return stackAlignment; } + + /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size + /// that still makes it profitable to inline the call. + unsigned getMaxInlineSizeThreshold() const { return MaxInlineSizeThreshold; } + + /// ParseSubtargetFeatures - Parses features string setting specified + /// subtarget options. Definition of function is auto generated by tblgen. + void ParseSubtargetFeatures(StringRef CPU, StringRef FS); + + /// AutoDetectSubtargetFeatures - Auto-detect CPU features using CPUID + /// instruction. 
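+  /// This is only used when neither a CPU name nor a feature string is
+  /// supplied.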
+ void AutoDetectSubtargetFeatures(); + + bool is64Bit() const { return In64BitMode; } + + PICStyles::Style getPICStyle() const { return PICStyle; } + void setPICStyle(PICStyles::Style Style) { PICStyle = Style; } + + bool hasCMov() const { return HasCMov; } + bool hasMMX() const { return X86SSELevel >= MMX; } + bool hasSSE1() const { return X86SSELevel >= SSE1; } + bool hasSSE2() const { return X86SSELevel >= SSE2; } + bool hasSSE3() const { return X86SSELevel >= SSE3; } + bool hasSSSE3() const { return X86SSELevel >= SSSE3; } + bool hasSSE41() const { return X86SSELevel >= SSE41; } + bool hasSSE42() const { return X86SSELevel >= SSE42; } + bool hasSSE4A() const { return HasSSE4A; } + bool has3DNow() const { return X863DNowLevel >= ThreeDNow; } + bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; } + bool hasPOPCNT() const { return HasPOPCNT; } + bool hasAVX() const { return HasAVX; } + bool hasXMM() const { return hasSSE1() || hasAVX(); } + bool hasXMMInt() const { return hasSSE2() || hasAVX(); } + bool hasAES() const { return HasAES; } + bool hasCLMUL() const { return HasCLMUL; } + bool hasFMA3() const { return HasFMA3; } + bool hasFMA4() const { return HasFMA4; } + bool hasMOVBE() const { return HasMOVBE; } + bool hasRDRAND() const { return HasRDRAND; } + bool hasF16C() const { return HasF16C; } + bool hasLZCNT() const { return HasLZCNT; } + bool hasBMI() const { return HasBMI; } + bool isBTMemSlow() const { return IsBTMemSlow; } + bool isUnalignedMemAccessFast() const { return IsUAMemFast; } + bool hasVectorUAMem() const { return HasVectorUAMem; } + bool hasCmpxchg16b() const { return HasCmpxchg16b; } + + const Triple &getTargetTriple() const { return TargetTriple; } + + bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); } + bool isTargetFreeBSD() const { + return TargetTriple.getOS() == Triple::FreeBSD; + } + bool isTargetSolaris() const { + return TargetTriple.getOS() == Triple::Solaris; + } + + // ELF is a reasonably sane default and the only other X86 targets we + // support are Darwin and Windows. Just use "not those". + bool isTargetELF() const { + return !isTargetDarwin() && !isTargetWindows() && !isTargetCygMing(); + } + bool isTargetLinux() const { return TargetTriple.getOS() == Triple::Linux; } + bool isTargetNaCl() const { + return TargetTriple.getOS() == Triple::NativeClient; + } + bool isTargetNaCl32() const { return isTargetNaCl() && !is64Bit(); } + bool isTargetNaCl64() const { return isTargetNaCl() && is64Bit(); } + + bool isTargetWindows() const { return TargetTriple.getOS() == Triple::Win32; } + bool isTargetMingw() const { return TargetTriple.getOS() == Triple::MinGW32; } + bool isTargetCygwin() const { return TargetTriple.getOS() == Triple::Cygwin; } + bool isTargetCygMing() const { + return isTargetMingw() || isTargetCygwin(); + } + + /// isTargetCOFF - Return true if this is any COFF/Windows target variant. + bool isTargetCOFF() const { + return isTargetMingw() || isTargetCygwin() || isTargetWindows(); + } + + bool isTargetWin64() const { + // FIXME: x86_64-cygwin has not been released yet. 
+ return In64BitMode && (isTargetCygMing() || isTargetWindows()); + } + + bool isTargetEnvMacho() const { + return isTargetDarwin() || (TargetTriple.getEnvironment() == Triple::MachO); + } + + bool isTargetWin32() const { + return !In64BitMode && (isTargetMingw() || isTargetWindows()); + } + + bool isPICStyleSet() const { return PICStyle != PICStyles::None; } + bool isPICStyleGOT() const { return PICStyle == PICStyles::GOT; } + bool isPICStyleRIPRel() const { return PICStyle == PICStyles::RIPRel; } + + bool isPICStyleStubPIC() const { + return PICStyle == PICStyles::StubPIC; + } + + bool isPICStyleStubNoDynamic() const { + return PICStyle == PICStyles::StubDynamicNoPIC; + } + bool isPICStyleStubAny() const { + return PICStyle == PICStyles::StubDynamicNoPIC || + PICStyle == PICStyles::StubPIC; } + + /// ClassifyGlobalReference - Classify a global variable reference for the + /// current subtarget according to how we should reference it in a non-pcrel + /// context. + unsigned char ClassifyGlobalReference(const GlobalValue *GV, + const TargetMachine &TM) const; + + /// ClassifyBlockAddressReference - Classify a blockaddress reference for the + /// current subtarget according to how we should reference it in a non-pcrel + /// context. + unsigned char ClassifyBlockAddressReference() const; + + /// IsLegalToCallImmediateAddr - Return true if the subtarget allows calls + /// to an immediate address. + bool IsLegalToCallImmediateAddr(const TargetMachine &TM) const; + + /// This function returns the name of a function which has an interface + /// like the non-standard bzero function, if such a function exists on + /// the current subtarget and it is considered preferable over + /// memset with zero passed as the second argument. Otherwise it + /// returns null. + const char *getBZeroEntry() const; + + /// getSpecialAddressLatency - For targets where it is beneficial to + /// backschedule instructions that compute addresses, return a value + /// indicating the number of scheduling cycles of backscheduling that + /// should be attempted. + unsigned getSpecialAddressLatency() const; +}; + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp new file mode 100644 index 0000000..15c6c4e --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -0,0 +1,158 @@ +//===-- X86TargetMachine.cpp - Define TargetMachine for the X86 -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the X86 specific subclass of TargetMachine. +// +//===----------------------------------------------------------------------===// + +#include "X86TargetMachine.h" +#include "X86.h" +#include "llvm/PassManager.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/FormattedStream.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Support/TargetRegistry.h" +using namespace llvm; + +extern "C" void LLVMInitializeX86Target() { + // Register the target.
+ RegisterTargetMachine<X86_32TargetMachine> X(TheX86_32Target); + RegisterTargetMachine<X86_64TargetMachine> Y(TheX86_64Target); +} + + +X86_32TargetMachine::X86_32TargetMachine(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, + Reloc::Model RM, CodeModel::Model CM) + : X86TargetMachine(T, TT, CPU, FS, RM, CM, false), + DataLayout(getSubtargetImpl()->isTargetDarwin() ? + "e-p:32:32-f64:32:64-i64:32:64-f80:128:128-f128:128:128-" + "n8:16:32-S128" : + (getSubtargetImpl()->isTargetCygMing() || + getSubtargetImpl()->isTargetWindows()) ? + "e-p:32:32-f64:64:64-i64:64:64-f80:32:32-f128:128:128-" + "n8:16:32-S32" : + "e-p:32:32-f64:32:64-i64:32:64-f80:32:32-f128:128:128-" + "n8:16:32-S128"), + InstrInfo(*this), + TSInfo(*this), + TLInfo(*this), + JITInfo(*this) { +} + + +X86_64TargetMachine::X86_64TargetMachine(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, + Reloc::Model RM, CodeModel::Model CM) + : X86TargetMachine(T, TT, CPU, FS, RM, CM, true), + DataLayout("e-p:64:64-s:64-f64:64:64-i64:64:64-f80:128:128-f128:128:128-" + "n8:16:32:64-S128"), + InstrInfo(*this), + TSInfo(*this), + TLInfo(*this), + JITInfo(*this) { +} + +/// X86TargetMachine ctor - Create an X86 target. +/// +X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, + Reloc::Model RM, CodeModel::Model CM, + bool is64Bit) + : LLVMTargetMachine(T, TT, CPU, FS, RM, CM), + Subtarget(TT, CPU, FS, StackAlignmentOverride, is64Bit), + FrameLowering(*this, Subtarget), + ELFWriterInfo(is64Bit, true) { + // Determine the PICStyle based on the target selected. + if (getRelocationModel() == Reloc::Static) { + // Unless we're in PIC or DynamicNoPIC mode, set the PIC style to None. + Subtarget.setPICStyle(PICStyles::None); + } else if (Subtarget.is64Bit()) { + // PIC in 64 bit mode is always rip-rel. + Subtarget.setPICStyle(PICStyles::RIPRel); + } else if (Subtarget.isTargetCygMing()) { + Subtarget.setPICStyle(PICStyles::None); + } else if (Subtarget.isTargetDarwin()) { + if (getRelocationModel() == Reloc::PIC_) + Subtarget.setPICStyle(PICStyles::StubPIC); + else { + assert(getRelocationModel() == Reloc::DynamicNoPIC); + Subtarget.setPICStyle(PICStyles::StubDynamicNoPIC); + } + } else if (Subtarget.isTargetELF()) { + Subtarget.setPICStyle(PICStyles::GOT); + } + + // default to hard float ABI + if (FloatABIType == FloatABI::Default) + FloatABIType = FloatABI::Hard; +} + +//===----------------------------------------------------------------------===// +// Command line options for x86 +//===----------------------------------------------------------------------===// +static cl::opt<bool> +UseVZeroUpper("x86-use-vzeroupper", + cl::desc("Minimize AVX to SSE transition penalty"), + cl::init(false)); + +//===----------------------------------------------------------------------===// +// Pass Pipeline Configuration +//===----------------------------------------------------------------------===// + +bool X86TargetMachine::addInstSelector(PassManagerBase &PM, + CodeGenOpt::Level OptLevel) { + // Install an instruction selector. + PM.add(createX86ISelDag(*this, OptLevel)); + + // For 32-bit, prepend instructions to set the "global base reg" for PIC. + if (!Subtarget.is64Bit()) + PM.add(createGlobalBaseRegPass()); + + return false; +} + +bool X86TargetMachine::addPreRegAlloc(PassManagerBase &PM, + CodeGenOpt::Level OptLevel) { + PM.add(createX86MaxStackAlignmentHeuristicPass()); + return false; // -print-machineinstr shouldn't print after this. 
+} + +bool X86TargetMachine::addPostRegAlloc(PassManagerBase &PM, + CodeGenOpt::Level OptLevel) { + PM.add(createX86FloatingPointStackifierPass()); + return true; // -print-machineinstr should print after this. +} + +bool X86TargetMachine::addPreEmitPass(PassManagerBase &PM, + CodeGenOpt::Level OptLevel) { + bool ShouldPrint = false; + if (OptLevel != CodeGenOpt::None && + (Subtarget.hasSSE2() || Subtarget.hasAVX())) { + PM.add(createExecutionDependencyFixPass(&X86::VR128RegClass)); + ShouldPrint = true; + } + + if (Subtarget.hasAVX() && UseVZeroUpper) { + PM.add(createX86IssueVZeroUpperPass()); + ShouldPrint = true; + } + + return ShouldPrint; +} + +bool X86TargetMachine::addCodeEmitter(PassManagerBase &PM, + CodeGenOpt::Level OptLevel, + JITCodeEmitter &JCE) { + PM.add(createX86JITCodeEmitterPass(*this, JCE)); + + return false; +} diff --git a/contrib/llvm/lib/Target/X86/X86TargetMachine.h b/contrib/llvm/lib/Target/X86/X86TargetMachine.h new file mode 100644 index 0000000..d1569aa --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86TargetMachine.h @@ -0,0 +1,133 @@ +//===-- X86TargetMachine.h - Define TargetMachine for the X86 ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the X86 specific subclass of TargetMachine. +// +//===----------------------------------------------------------------------===// + +#ifndef X86TARGETMACHINE_H +#define X86TARGETMACHINE_H + +#include "X86.h" +#include "X86ELFWriterInfo.h" +#include "X86InstrInfo.h" +#include "X86ISelLowering.h" +#include "X86FrameLowering.h" +#include "X86JITInfo.h" +#include "X86SelectionDAGInfo.h" +#include "X86Subtarget.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetFrameLowering.h" + +namespace llvm { + +class formatted_raw_ostream; +class StringRef; + +class X86TargetMachine : public LLVMTargetMachine { + X86Subtarget Subtarget; + X86FrameLowering FrameLowering; + X86ELFWriterInfo ELFWriterInfo; + +public: + X86TargetMachine(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, + Reloc::Model RM, CodeModel::Model CM, + bool is64Bit); + + virtual const X86InstrInfo *getInstrInfo() const { + llvm_unreachable("getInstrInfo not implemented"); + } + virtual const TargetFrameLowering *getFrameLowering() const { + return &FrameLowering; + } + virtual X86JITInfo *getJITInfo() { + llvm_unreachable("getJITInfo not implemented"); + } + virtual const X86Subtarget *getSubtargetImpl() const{ return &Subtarget; } + virtual const X86TargetLowering *getTargetLowering() const { + llvm_unreachable("getTargetLowering not implemented"); + } + virtual const X86SelectionDAGInfo *getSelectionDAGInfo() const { + llvm_unreachable("getSelectionDAGInfo not implemented"); + } + virtual const X86RegisterInfo *getRegisterInfo() const { + return &getInstrInfo()->getRegisterInfo(); + } + virtual const X86ELFWriterInfo *getELFWriterInfo() const { + return Subtarget.isTargetELF() ? &ELFWriterInfo : 0; + } + + // Set up the pass pipeline. 
+ virtual bool addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel); + virtual bool addPreRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel); + virtual bool addPostRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel); + virtual bool addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel); + virtual bool addCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel, + JITCodeEmitter &JCE); +}; + +/// X86_32TargetMachine - X86 32-bit target machine. +/// +class X86_32TargetMachine : public X86TargetMachine { + const TargetData DataLayout; // Calculates type size & alignment + X86InstrInfo InstrInfo; + X86SelectionDAGInfo TSInfo; + X86TargetLowering TLInfo; + X86JITInfo JITInfo; +public: + X86_32TargetMachine(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, + Reloc::Model RM, CodeModel::Model CM); + virtual const TargetData *getTargetData() const { return &DataLayout; } + virtual const X86TargetLowering *getTargetLowering() const { + return &TLInfo; + } + virtual const X86SelectionDAGInfo *getSelectionDAGInfo() const { + return &TSInfo; + } + virtual const X86InstrInfo *getInstrInfo() const { + return &InstrInfo; + } + virtual X86JITInfo *getJITInfo() { + return &JITInfo; + } +}; + +/// X86_64TargetMachine - X86 64-bit target machine. +/// +class X86_64TargetMachine : public X86TargetMachine { + const TargetData DataLayout; // Calculates type size & alignment + X86InstrInfo InstrInfo; + X86SelectionDAGInfo TSInfo; + X86TargetLowering TLInfo; + X86JITInfo JITInfo; +public: + X86_64TargetMachine(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, + Reloc::Model RM, CodeModel::Model CM); + virtual const TargetData *getTargetData() const { return &DataLayout; } + virtual const X86TargetLowering *getTargetLowering() const { + return &TLInfo; + } + virtual const X86SelectionDAGInfo *getSelectionDAGInfo() const { + return &TSInfo; + } + virtual const X86InstrInfo *getInstrInfo() const { + return &InstrInfo; + } + virtual X86JITInfo *getJITInfo() { + return &JITInfo; + } +}; + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp new file mode 100644 index 0000000..991f322 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp @@ -0,0 +1,45 @@ +//===-- llvm/Target/X86/X86TargetObjectFile.cpp - X86 Object Info ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "X86TargetObjectFile.h" +#include "X86TargetMachine.h" +#include "llvm/CodeGen/MachineModuleInfoImpls.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCSectionMachO.h" +#include "llvm/Target/Mangler.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/Support/Dwarf.h" +using namespace llvm; +using namespace dwarf; + +const MCExpr *X8664_MachoTargetObjectFile:: +getExprForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang, + MachineModuleInfo *MMI, unsigned Encoding, + MCStreamer &Streamer) const { + + // On Darwin/X86-64, we can reference dwarf symbols with foo@GOTPCREL+4, which + // is an indirect pc-relative reference. 
+ if (Encoding & (DW_EH_PE_indirect | DW_EH_PE_pcrel)) { + const MCSymbol *Sym = Mang->getSymbol(GV); + const MCExpr *Res = + MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTPCREL, getContext()); + const MCExpr *Four = MCConstantExpr::Create(4, getContext()); + return MCBinaryExpr::CreateAdd(Res, Four, getContext()); + } + + return TargetLoweringObjectFileMachO:: + getExprForDwarfGlobalReference(GV, Mang, MMI, Encoding, Streamer); +} + +MCSymbol *X8664_MachoTargetObjectFile:: +getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang, + MachineModuleInfo *MMI) const { + return Mang->getSymbol(GV); +} diff --git a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h new file mode 100644 index 0000000..d7adf27 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h @@ -0,0 +1,38 @@ +//===-- llvm/Target/X86/X86TargetObjectFile.h - X86 Object Info -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TARGET_X86_TARGETOBJECTFILE_H +#define LLVM_TARGET_X86_TARGETOBJECTFILE_H + +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetLoweringObjectFile.h" + +namespace llvm { + class X86TargetMachine; + + /// X8664_MachoTargetObjectFile - This TLOF implementation is used for Darwin + /// x86-64. + class X8664_MachoTargetObjectFile : public TargetLoweringObjectFileMachO { + public: + virtual const MCExpr * + getExprForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang, + MachineModuleInfo *MMI, unsigned Encoding, + MCStreamer &Streamer) const; + + // getCFIPersonalitySymbol - The symbol that gets passed to + // .cfi_personality. + virtual MCSymbol * + getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang, + MachineModuleInfo *MMI) const; + }; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp b/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp new file mode 100644 index 0000000..3958494 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp @@ -0,0 +1,105 @@ +//===-- X86VZeroUpper.cpp - AVX vzeroupper instruction inserter -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the pass which inserts x86 AVX vzeroupper instructions +// before calls to SSE encoded functions. This avoids the transition latency +// penalty when transferring control between AVX encoded instructions and old +// SSE encoding mode.
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "x86-codegen" +#include "X86.h" +#include "X86InstrInfo.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/GlobalValue.h" +#include "llvm/Target/TargetInstrInfo.h" +using namespace llvm; + +STATISTIC(NumVZU, "Number of vzeroupper instructions inserted"); + +namespace { + struct VZeroUpperInserter : public MachineFunctionPass { + static char ID; + VZeroUpperInserter() : MachineFunctionPass(ID) {} + + virtual bool runOnMachineFunction(MachineFunction &MF); + + bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB); + + virtual const char *getPassName() const { return "X86 vzeroupper inserter";} + + private: + const TargetInstrInfo *TII; // Machine instruction info. + MachineBasicBlock *MBB; // Current basic block + }; + char VZeroUpperInserter::ID = 0; +} + +FunctionPass *llvm::createX86IssueVZeroUpperPass() { + return new VZeroUpperInserter(); +} + +/// runOnMachineFunction - Loop over all of the basic blocks, inserting +/// vzeroupper instructions before function calls. +bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) { + TII = MF.getTarget().getInstrInfo(); + bool Changed = false; + + // Process all basic blocks in arbitrary order. + for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB) + Changed |= processBasicBlock(MF, *BB); + + return Changed; +} + +static bool isCallToModuleFn(const MachineInstr *MI) { + assert(MI->getDesc().isCall() && "Isn't a call instruction"); + + for (int i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + + if (!MO.isGlobal()) + continue; + + const GlobalValue *GV = MO.getGlobal(); + GlobalValue::LinkageTypes LT = GV->getLinkage(); + if (GV->isInternalLinkage(LT) || GV->isPrivateLinkage(LT) || + (GV->isExternalLinkage(LT) && !GV->isDeclaration())) + return true; + + return false; + } + return false; +} + +/// processBasicBlock - Loop over all of the instructions in the basic block, +/// inserting vzeroupper instructions before function calls. +bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF, + MachineBasicBlock &BB) { + bool Changed = false; + MBB = &BB; + + for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) { + MachineInstr *MI = I; + DebugLoc dl = I->getDebugLoc(); + + // Insert a vzeroupper instruction before each control transfer + // to functions outside this module. + if (MI->getDesc().isCall() && !isCallToModuleFn(MI)) { + BuildMI(*MBB, I, dl, TII->get(X86::VZEROUPPER)); + ++NumVZU; + } + } + + return Changed; +}
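The call-site policy in isCallToModuleFn() above reduces to a linkage test: a call needs no preceding VZEROUPPER only when its callee is known to be defined in the current module (internal or private linkage, or external linkage with a definition present); any other call, including calls through a register with no global operand, is conservatively guarded. The following is a minimal stand-alone sketch of that decision only; Linkage and calleeIsInModule are hypothetical stand-ins for illustration and are not part of the LLVM GlobalValue API.

    #include <cassert>
    #include <iostream>

    // Hypothetical stand-in for GlobalValue's linkage kinds (illustration only).
    enum class Linkage { Internal, Private, ExternalDefined, ExternalDeclared };

    // Mirrors the shape of isCallToModuleFn(): true when the callee is known
    // to be defined in this module, so no VZEROUPPER is needed before the call.
    static bool calleeIsInModule(Linkage L) {
      return L == Linkage::Internal || L == Linkage::Private ||
             L == Linkage::ExternalDefined;
    }

    int main() {
      // A call to an external declaration (e.g. an SSE-compiled library
      // routine) is exactly the case the pass guards with a VZEROUPPER.
      assert(!calleeIsInModule(Linkage::ExternalDeclared));
      assert(calleeIsInModule(Linkage::Internal));
      std::cout << "vzeroupper needed before external call: " << std::boolalpha
                << !calleeIsInModule(Linkage::ExternalDeclared) << '\n';
      return 0;
    }

In the pass itself, a call whose operands include no global at all also fails isCallToModuleFn() and therefore gets a VZEROUPPER, which is the conservative choice for indirect calls.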